Skip to content

Commit

Permalink
Use bz2.open.
Browse files: browse the repository at this point in the history
  • Loading branch information
attardi committed Dec 5, 2020
1 parent a2e078f commit 3150f60
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 7 deletions.
4 changes: 2 additions & 2 deletions wikiextractor/WikiExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
# ===========================================================================

# Program version
__version__ = '3.0.3'
__version__ = '3.0.4'

##
# Defined in <siteinfo>
Expand Down Expand Up @@ -266,7 +266,7 @@ def decode_open(filename, mode='rt', encoding='utf-8'):
ext = os.path.splitext(filename)[1]
if ext == '.gz':
import gzip
return gzip.open(filename, mode)
return gzip.open(filename, mode, encoding=encoding)
elif ext == '.bz2':
return bz2.open(filename, mode=mode, encoding=encoding)
else:
Expand Down
11 changes: 6 additions & 5 deletions wikiextractor/extractPage.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@


# Program version
__version__ = '3.0.3'
__version__ = '3.0.4'

# ----------------------------------------------------------------------
# READER
Expand All @@ -49,13 +49,14 @@ def process_data(input_file, id, templates=False):
:param id: article id
"""

opener = bz2.BZ2File if input_file.lower().endswith("bz2") else open

input = opener(input_file)
if input_file.lower().endswith(".bz2"):
input = bz2.open(input_file, mode='rt', encoding='utf-8')
else:
input = open(input_file)

page = []
for line in input:
line = line.decode('utf-8')
line = line
if '<' not in line: # faster than doing re.search()
if page:
page.append(line)
Expand Down

0 comments on commit 3150f60

Please sign in to comment.