Skip to content

Commit

Permalink
Fixed handling of templates.
Browse files Browse the repository at this point in the history
  • Loading branch information
attardi committed Jan 24, 2023
1 parent f0ca16c commit 8f1b434
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 17 deletions.
20 changes: 10 additions & 10 deletions wikiextractor/WikiExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-

# =============================================================================
# Version: 3.0 (July 22, 2020)
# Version: 3.0 (January 24, 2023)
# Author: Giuseppe Attardi ([email protected]), University of Pisa
#
# Contributors:
Expand All @@ -17,7 +17,7 @@
# Nick Ulven (nulven@github)
#
# =============================================================================
# Copyright (c) 2009-2020. Giuseppe Attardi ([email protected]).
# Copyright (c) 2009-2023. Giuseppe Attardi ([email protected]).
# =============================================================================
# This file is part of Tanl.
#
Expand Down Expand Up @@ -68,7 +68,7 @@
# ===========================================================================

# Program version
__version__ = '3.0.6'
__version__ = '3.0.7'

##
# Defined in <siteinfo>
Expand Down Expand Up @@ -194,6 +194,7 @@ def load_templates(file, output_file=None):
"""
Load templates from :param file:.
:param output_file: file where to save templates and modules.
:return: number of templates loaded.
"""
global templateNamespace
global moduleNamespace, modulePrefix
Expand Down Expand Up @@ -335,14 +336,16 @@ def collect_pages(text):


def process_dump(input_file, template_file, out_file, file_size, file_compress,
process_count, html_safe):
process_count, html_safe, expand_templates=True):
"""
:param input_file: name of the wikipedia dump file; '-' to read from stdin
:param template_file: optional file with template definitions.
:param out_file: directory where to store extracted data, or '-' for stdout
:param file_size: max size of each extracted file, or None for no max (one file)
:param file_compress: whether to compress files with bzip.
:param process_count: number of extraction processes to spawn.
:param html_safe: whether to convert entities in text to HTML.
:param expand_templates: whether to expand templates.
"""
global knownNamespaces
global templateNamespace
Expand Down Expand Up @@ -528,7 +531,7 @@ def reduce_process(output_queue, output):

def main():
global acceptedNamespaces
global expand_templates, templateCache
global templateCache

parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter,
Expand All @@ -555,7 +558,7 @@ def main():
help="accepted namespaces")
groupP.add_argument("--templates",
help="use or create file containing templates")
groupP.add_argument("--no-templates", action="store_false",
groupP.add_argument("--no-templates", action="store_true",
help="Do not expand templates")
groupP.add_argument("--html-safe", default=True,
help="use to produce HTML safe output within <doc>...</doc>")
Expand All @@ -582,8 +585,6 @@ def main():
Extractor.keepLinks = True
Extractor.to_json = args.json

expand_templates = args.no_templates

try:
power = 'kmg'.find(args.bytes[-1].lower()) + 1
# 0 bytes means put a single article per file.
Expand Down Expand Up @@ -636,8 +637,7 @@ def main():
return

process_dump(input_file, args.templates, output_path, file_size,
args.compress, args.processes, args.html_safe)

args.compress, args.processes, args.html_safe, not args.no_templates)

if __name__ == '__main__':
main()
17 changes: 10 additions & 7 deletions wikiextractor/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
from html.entities import name2codepoint
import logging
import time
import pdb # DEBUG

# ----------------------------------------------------------------------

Expand Down Expand Up @@ -82,7 +81,6 @@ def clean(extractor, text, expand_templates=False, html_safe=True):
if expand_templates:
# expand templates
# See: http://www.mediawiki.org/wiki/Help:Templates
pdb.set_trace() # DEBUG
text = extractor.expandTemplates(text)
else:
# Drop transclusions (template, parser functions)
Expand Down Expand Up @@ -830,7 +828,7 @@ def subst(self, params, extractor, depth=0):
# {{ppp|q=r|p=q}} gives r, but using Template:tvvv containing
# "{{{{{{{{{p}}}}}}}}}", {{tvvv|p=q|q=r|r=s}} gives s.

#logging.debug('subst tpl (%d, %d) %s', len(extractor.frame), depth, self)
logging.debug('subst tpl (%d, %d) %s', len(extractor.frame), depth, self)

if depth > extractor.maxParameterRecursionLevels:
extractor.recursion_exceeded_3_errs += 1
Expand Down Expand Up @@ -952,6 +950,7 @@ def clean_text(self, text, mark_headers=False, expand_templates=True,
e.g. "## Section 1"
"""
self.magicWords['namespace'] = self.title[:max(0, self.title.find(":"))]
#self.magicWords['namespacenumber'] = '0' # for article,
self.magicWords['pagename'] = self.title
self.magicWords['fullpagename'] = self.title
self.magicWords['currentyear'] = time.strftime('%Y')
Expand Down Expand Up @@ -1008,7 +1007,7 @@ def extract(self, out, html_safe=True):
# Expand templates

maxTemplateRecursionLevels = 30
maxParameterRecursionLevels = 10
maxParameterRecursionLevels = 16

# check for template beginning
reOpen = re.compile('(?<!{){{(?!{)', re.DOTALL)
Expand Down Expand Up @@ -1764,13 +1763,17 @@ def sharp_invoke(module, function, frame):

'int': lambda string, *rest: str(int(string)),

'padleft': lambda char, width, string: string.ljust(char, int(pad)), # CHECK_ME

}


def callParserFunction(functionName, args, frame):
"""
Parser functions have a syntax similar to templates, except that
the first argument is everything after the first colon.
:param functionName: name of the parser function
:param args: the arguments to the function
:return: the result of the invocation, None in case of failure.
http://meta.wikimedia.org/wiki/Help:ParserFunctions
Expand All @@ -1780,11 +1783,11 @@ def callParserFunction(functionName, args, frame):
if functionName == '#invoke':
# special handling of frame
ret = sharp_invoke(args[0].strip(), args[1].strip(), frame)
# logging.debug('parserFunction> %s %s', functionName, ret)
# logging.debug('parserFunction> %s %s', args[1], ret)
return ret
if functionName in parserFunctions:
ret = parserFunctions[functionName](*args)
# logging.debug('parserFunction> %s %s', functionName, ret)
# logging.debug('parserFunction> %s(%s) %s', functionName, args, ret)
return ret
except:
return "" # FIXME: fix errors
Expand Down Expand Up @@ -1851,6 +1854,6 @@ def define_template(title, page):
text = reIncludeonly.sub('', text)

if text:
if title in templates:
if title in templates and templates[title] != text:
logging.warn('Redefining: %s', title)
templates[title] = text

0 comments on commit 8f1b434

Please sign in to comment.