-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_mueller_pdf.py
41 lines (33 loc) · 1.04 KB
/
extract_mueller_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
'''
Script to parse Mueller PDF report - uses PDFMiner
'''
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
def pdfparser(data):
    '''
    Extract the text of a PDF file using PDFMiner.

    :param data: path of the pdf file to read and extract text from
    :return: everything the TextConverter wrote for the document's pages,
        as a single string
    '''
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    try:
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # ``open`` instead of the Python 2-only ``file`` builtin; the ``with``
        # block closes the handle even when a page fails to process.
        with open(data, 'rb') as fp:
            # Process each page contained in the document.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        # The converter writes every page into the same buffer, so read it
        # once after the loop; reading it per page would repeat earlier pages.
        return retstr.getvalue()
    finally:
        # Runs after the return value has been copied out of the buffer.
        device.close()
        retstr.close()
# Default input, parsed when the module is imported or run without arguments.
data = 'mueller-report-searchable.pdf'
# When run as a script, a path given on the command line replaces the default.
# Checking the argument count avoids an IndexError when none is supplied, and
# choosing the path before parsing means only one PDF is ever processed.
if __name__ == '__main__' and len(sys.argv) > 1:
    data = sys.argv[1]
result = pdfparser(data=data)
# NOTE(review): the extracted text is only kept in ``result``; nothing is
# printed or saved here - confirm whether the script is expected to emit it.