当前位置: 技术问答>java相关
如何从Java程序读取PDF文档中的文本信息?
来源: 互联网 发布时间:2015-05-09
本文导语: 最好可以有一些程序片断。 | 以前看到的,不知道有没有价值 Re: How to read a PDF file's content using Etymon PJ? Body: try this code : import java.io.*; import java.util.*; import com.etymon.pj.*; impor...
最好可以有一些程序片断。
|
以前看到的,不知道有没有价值
Re: How to read a PDF file's content using Etymon PJ?
Body: try this code :
import java.io.*;
import java.util.*;
import com.etymon.pj.*;
import com.etymon.pj.object.*;
import com.etymon.pj.exception.*;
/**
* This is a wrapper for the Pj PDF parser
*/
public class PjWrapper {
Pdf pdf;
PjCatalog catalog;
PjPagesNode rootPage;
/**
 * Opens a PDF document and looks up the root of its page tree.
 *
 * @param PdfFileName  name of the PDF file, passed to the {@code Pdf} constructor
 * @param TextFileName not used by this constructor
 * @throws IOException declared by this constructor; raised by the calls it makes
 * @throws PjException declared by this constructor; raised by the calls it makes
 */
public PjWrapper(String PdfFileName, String TextFileName)
        throws IOException, PjException {
    this.pdf = new Pdf(PdfFileName);

    // The catalog is fetched directly via getObject(); this assumes the
    // catalog itself is never stored as a reference.
    this.catalog = (PjCatalog) this.pdf.getObject(this.pdf.getCatalog());

    // The catalog holds a reference to the root node of the pages tree,
    // so it has to be resolved before it can be used.
    this.rootPage = (PjPagesNode) this.pdf.resolve(this.catalog.getPages());
}
/**
 * Command-line entry point; currently performs no work.
 *
 * @param args command-line arguments (unused while the sample below is disabled)
 * @throws IOException declared for the disabled sample code; never thrown at present
 * @throws PjException declared for the disabled sample code; never thrown at present
 */
public static void main(String[] args) throws IOException, PjException {
    // Disabled sample usage, kept for reference:
    //   PjWrapper testWrapper = new PjWrapper(args[0]);
    //   LinkedList textList = testWrapper.getAllText();
    // NOTE(review): the constructor takes two String arguments, so this
    // one-argument call must be updated before the sample is re-enabled.
}
/**
* Returns as much text as we can extract from the PDF.
* This currently includes:
*
* NOTE: Pj does not support LZW, so some text in some PDF's may not
* be indexable
*/
public LinkedList getAllText() throws PjException {
LinkedList stringList = new LinkedList();
Iterator streamIter = getAllContentsStreams().iterator();
PjStream stream;
String streamData;
String streamText;
boolean moreData;
int textStart, textEnd;
//System.out.println("Going through streams...");
while(streamIter.hasNext()) {
//System.out.println("Getting next stream");
stream = (PjStream) streamIter.next();
//System.out.println("Adding text from stream with filter: "
+getFilterString(stream);
stream = stream.flateDecompress();
//System.out.println("Adding text from stream with filter
afterdecompress: " + getFilterString(stream));
streamData = new String(stream.getBuffer());
streamText = new String();
moreData = true;
textStart = textEnd = 0;
while(moreData) {
if ((textStart = streamData.indexOf('(', textEnd + 1))
Re: How to read a PDF file's content using Etymon PJ?
Body: try this code :
import java.io.*;
import java.util.*;
import com.etymon.pj.*;
import com.etymon.pj.object.*;
import com.etymon.pj.exception.*;
/**
* This is a wrapper for the Pj PDF parser
*/
public class PjWrapper {
Pdf pdf;
PjCatalog catalog;
PjPagesNode rootPage;
/**
 * Opens a PDF document and looks up the root of its page tree.
 *
 * @param PdfFileName  name of the PDF file, passed to the {@code Pdf} constructor
 * @param TextFileName not used by this constructor
 * @throws IOException declared by this constructor; raised by the calls it makes
 * @throws PjException declared by this constructor; raised by the calls it makes
 */
public PjWrapper(String PdfFileName, String TextFileName)
        throws IOException, PjException {
    this.pdf = new Pdf(PdfFileName);

    // The catalog is fetched directly via getObject(); this assumes the
    // catalog itself is never stored as a reference.
    this.catalog = (PjCatalog) this.pdf.getObject(this.pdf.getCatalog());

    // The catalog holds a reference to the root node of the pages tree,
    // so it has to be resolved before it can be used.
    this.rootPage = (PjPagesNode) this.pdf.resolve(this.catalog.getPages());
}
/**
 * Command-line entry point; currently performs no work.
 *
 * @param args command-line arguments (unused while the sample below is disabled)
 * @throws IOException declared for the disabled sample code; never thrown at present
 * @throws PjException declared for the disabled sample code; never thrown at present
 */
public static void main(String[] args) throws IOException, PjException {
    // Disabled sample usage, kept for reference:
    //   PjWrapper testWrapper = new PjWrapper(args[0]);
    //   LinkedList textList = testWrapper.getAllText();
    // NOTE(review): the constructor takes two String arguments, so this
    // one-argument call must be updated before the sample is re-enabled.
}
/**
* Returns as much text as we can extract from the PDF.
* This currently includes:
*
* NOTE: Pj does not support LZW, so some text in some PDF's may not
* be indexable
*/
public LinkedList getAllText() throws PjException {
LinkedList stringList = new LinkedList();
Iterator streamIter = getAllContentsStreams().iterator();
PjStream stream;
String streamData;
String streamText;
boolean moreData;
int textStart, textEnd;
//System.out.println("Going through streams...");
while(streamIter.hasNext()) {
//System.out.println("Getting next stream");
stream = (PjStream) streamIter.next();
//System.out.println("Adding text from stream with filter: "
+getFilterString(stream);
stream = stream.flateDecompress();
//System.out.println("Adding text from stream with filter
afterdecompress: " + getFilterString(stream));
streamData = new String(stream.getBuffer());
streamText = new String();
moreData = true;
textStart = textEnd = 0;
while(moreData) {
if ((textStart = streamData.indexOf('(', textEnd + 1))