Skip to content

Commit

Permalink
perf: read files as you process them, not up front
Browse files (browse the repository at this point in the history)
  • Loading branch information
joanise committed Jul 5, 2024
1 parent d4bffad commit 9f4ab1a
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 32 deletions.
70 changes: 38 additions & 32 deletions g2p/cli.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -537,39 +537,45 @@ def convert( # noqa: C901
and input_text.endswith("txt")
or re.match(r"/dev/(fd/[0-9]*|stdin)", input_text)
)
if input_text_is_a_file:
with open(input_text, encoding="utf8") as f:
lines = f.readlines()
elif input_text == "-":
lines = sys.stdin.readlines()
else:
lines = [input_text]
# Determine which tokenizer to use, if any
if tok is not None and not tok and tok_lang is not None:
raise click.UsageError("Specified conflicting --no-tok and --tok-lang options.")
if tok is None:
tok = True # Tokenize by default
custom_tokenizer = make_tokenizer(tok_lang) if tok_lang else None
# Transduce!!!
assert in_lang and out_lang
transducer = make_g2p(
in_lang, out_lang, tokenize=tok, custom_tokenizer=custom_tokenizer
)
for line in lines:
tg = transducer(line)
if check:
transducer.check(tg, display_warnings=True)
outputs = [tg.output_string]
if substring_alignments:
outputs += [tg.substring_alignments()]
if pretty_edges:
outputs += [tg.pretty_edges()]
if debugger:
outputs += [tg.edges, tg.debugger]
if len(outputs) > 1:
click.echo(pprint.pformat(outputs, indent=4))
to_close = None
try:
if input_text_is_a_file:
to_close = lines = open(input_text, encoding="utf8")
elif input_text == "-":
lines = sys.stdin

Check warning on line 545 in g2p/cli.py

View check run for this annotation

Codecov / codecov/patch

g2p/cli.py#L545

Added line #L545 was not covered by tests
else:
click.echo(tg.output_string, nl=not input_text_is_a_file)
lines = [input_text]
# Determine which tokenizer to use, if any
if tok is not None and not tok and tok_lang is not None:
raise click.UsageError(
"Specified conflicting --no-tok and --tok-lang options."
)
if tok is None:
tok = True # Tokenize by default
custom_tokenizer = make_tokenizer(tok_lang) if tok_lang else None
# Transduce!!!
assert in_lang and out_lang
transducer = make_g2p(
in_lang, out_lang, tokenize=tok, custom_tokenizer=custom_tokenizer
)
for line in lines:
tg = transducer(line)
if check:
transducer.check(tg, display_warnings=True)
outputs = [tg.output_string]
if substring_alignments:
outputs += [tg.substring_alignments()]
if pretty_edges:
outputs += [tg.pretty_edges()]
if debugger:
outputs += [tg.edges, tg.debugger]
if len(outputs) > 1:
click.echo(pprint.pformat(outputs, indent=4))
else:
click.echo(tg.output_string, nl=not input_text_is_a_file)
finally:
if to_close is not None:
to_close.close()


# Note: with -m eng-ipa, we actually check all the mappings from lang-ipa to eng-ipa.
Expand Down
4 changes: 4 additions & 0 deletions g2p/tests/test_cli.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -495,6 +495,10 @@ def test_convert_from_file(self):
self.assertIn("fʁɑ̃sɛ", results.output)
with open(input_file, "r", encoding="utf8") as f:
lines_in = len(list(f))
# Make sure there is no resource warning about unclosed files
self.assertNotIn("ResourceWarning", results.output)
self.assertNotIn("unclose file", results.output)
# The output should have the same number of lines as the input
self.assertEqual(lines_in, len(results.output.splitlines()))

def test_convert_errors(self):
Expand Down

0 comments on commit 9f4ab1a

Please sign in to comment.