# For Python 3.6, install pdfminer.six or pdfminer3k with pip (e.g. `pip install pdfminer.six`).
from io
import StringIO
from pdfminer.pdfinterp
import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter
import TextConverter
from pdfminer.layout
import LAParams
from pdfminer.pdfpage
import PDFPage
def convert_pdf_to_text(fname, pages=None):
    """Extract the text content of a PDF file as a single string.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to restrict extraction to.
            When omitted or empty, an empty set is handed to
            ``PDFPage.get_pages`` (presumably meaning "all pages", and page
            numbers are presumably zero-based -- verify against pdfminer).

    Returns:
        The text accumulated by the ``TextConverter`` for the processed pages.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    try:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # The context manager closes the file even if a page fails to parse.
            with open(fname, "rb") as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer only after the converter is closed, as before.
        return output.getvalue()
    finally:
        # The original wrote `output.close` without calling it, so the
        # buffer was never actually released.
        output.close()
# Demo: extract the text of the sample PDF and echo it to stdout.
pdf_text = convert_pdf_to_text("1DD76387.pdf")
print(pdf_text)
----------------------------------------------------------------------------------------------------------------
from pdfminer.pdfinterp
import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter
import HTMLConverter
from pdfminer.layout
import LAParams
from pdfminer.pdfpage
import PDFPage
from io
import BytesIO
def convert_pdf_to_html(path):
    """Render every page of a PDF file to HTML.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The bytes written by the ``HTMLConverter`` (configured with the
        ``utf-8`` codec) into an in-memory buffer.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    try:
        device = HTMLConverter(
            rsrcmgr, retstr, codec="utf-8", laparams=LAParams()
        )
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # The context manager closes the file even if a page fails to parse.
            with open(path, "rb") as fp:
                page_iter = PDFPage.get_pages(
                    fp,
                    set(),  # no explicit page selection
                    maxpages=0,  # 0 is for all pages
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in page_iter:
                    interpreter.process_page(page)
        finally:
            # Close the device before reading the buffer, matching the
            # original ordering (closing may still emit trailing output).
            device.close()
        # Named `html` rather than `str` to avoid shadowing the builtin.
        html = retstr.getvalue()
        return html
    finally:
        retstr.close()
# Demo: convert the sample PDF to HTML and keep the resulting bytes.
test = convert_pdf_to_html("1DD76387.pdf")