I need help to achieve a mapping between text and image objects in a PDF document.
As the first figure shows, my PDF documents have 3 images arranged randomly in the y-direction. To the left of them are texts. The texts extend along the height of the images.
My goal is to combine the texts into "ImObj" objects (see the class ImObj).
The 2nd figure shows that I want to use the height of the image to detect the position of the texts (all texts outside of the image height should be ignored). In the example, there will be 3 ImObj-objects formed by the 3 images.
The link to the pdf file is here (on wetransfer): [enter link description here][3]
But my mapping does not work, because I probably use the wrong coordinates from the image. Now I have already looked at some examples, but I still don't really understand how to get the coordinates of text and images working together? Here is my code:
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
public class ImExample extends PDFTextStripper {
public static void main(String[] args) {
File file = new File("C://example document.pdf");
try {
PDDocument document = PDDocument.load(file);
ImExample example = new ImExample();
for (int pnr = 0; pnr < document.getPages().getCount(); pnr++) {
PDPage page = document.getPages().get(pnr);
PDResources res = page.getResources();
example.processPage(page);
int idx = 0;
for (COSName objName : res.getXObjectNames()) {
PDXObject xObj = res.getXObject(objName);
if (xObj instanceof PDImageXObject) {
System.out.println("...add a new image");
PDImageXObject imXObj = (PDImageXObject) xObj;
BufferedImage image = imXObj.getImage();
// Here is my mistake ... but I do not know how to solve it.
ImObj imObj = new ImObj(image, idx++, pnr, image.getMinY(), image.getMinY() + image.getHeight());
example.imObjects.add(imObj);
}
}
}
example.setSortByPosition(true);
example.getText(document);
// Output
for (ImObj iObj : example.imObjects)
System.out.println(iObj.idx + " -> " + iObj.text);
document.close();
} catch (Exception e) {
e.printStackTrace();
}
}
public List<ImObj> imObjects = new ArrayList<ImObj>();
public ImExample() throws IOException {
super();
}
@Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
// match between imagesize and textposition
TextPosition txtPos = textPositions.get(0);
for (ImObj im : imObjects) {
if(im.page == (this.getCurrentPageNo()-1))
if (im.minY < txtPos.getY() && (txtPos.getY() + txtPos.getHeight()) < im.maxY)
im.text.append(text + " ");
}
}
}
class ImObj {
float minY, maxY;
Image image = null;
StringBuilder text = new StringBuilder("");
int idx, page = 0;
public ImObj(Image im, int idx, int pnr, float yMin, float yMax) {
this.idx = idx;
this.image = im;
this.minY = yMin;
this.maxY = yMax;
this.page = pnr;
}
}
Best regards