Skip to content

Commit

Permalink
✨ Add {m} and {n,m} in PythonRegex
Browse files Browse the repository at this point in the history
  • Loading branch information
Aunsiels committed Mar 18, 2024
1 parent e7214f7 commit 61ac29b
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 4 deletions.
69 changes: 66 additions & 3 deletions pyformlang/regular_expression/python_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def _recombine(regex_to_recombine):
if regex_to_recombine[idx] == "\\x" and idx < len(regex_to_recombine) - 2 \
and regex_to_recombine[idx + 1] in HEXASTRING \
and regex_to_recombine[idx + 2] in HEXASTRING:
next_str = "".join(regex_to_recombine[idx+1:idx+3])
next_str = "".join(regex_to_recombine[idx + 1:idx + 3])
s_trans = chr(int(next_str, 16))
temp.append(TRANSFORMATIONS.get(s_trans, s_trans))
idx += 3
Expand All @@ -197,12 +197,12 @@ def _recombine(regex_to_recombine):
temp.append(TRANSFORMATIONS.get(name, name))
idx = idx_end + 1
elif regex_to_recombine[idx] == "\\u":
unicode_str = "".join(regex_to_recombine[idx+1: idx+5])
unicode_str = "".join(regex_to_recombine[idx + 1: idx + 5])
decoded = chr(int(unicode_str, 16))
temp.append(TRANSFORMATIONS.get(decoded, decoded))
idx = idx + 5
elif regex_to_recombine[idx] == "\\U":
unicode_str = "".join(regex_to_recombine[idx+1: idx+9])
unicode_str = "".join(regex_to_recombine[idx + 1: idx + 9])
decoded = chr(int(unicode_str, 16))
temp.append(TRANSFORMATIONS.get(decoded, decoded))
idx = idx + 9
Expand Down Expand Up @@ -291,8 +291,71 @@ def _preprocess_positive_closure(self):
for j in range(pos_opening, len(regex_temp)):
regex_temp.append(regex_temp[j])
regex_temp.append("*")
regex_temp = self._add_repetition(regex_temp)
self._python_regex = "".join(regex_temp)

@staticmethod
def _is_repetition(regex_list, idx):
if regex_list[idx] == "{":
end = idx
for i in range(idx + 1, len(regex_list)):
if regex_list[i] == "}":
end = i
break
inner = "".join(regex_list[idx + 1:end])
if "," in inner:
split = inner.split(",")
if len(split) != 2 or not split[0].isdigit() or not split[1].isdigit():
return None
return int(split[0]), int(split[1]), end
if inner.isdigit():
return int(inner), end
return None

@staticmethod
def _find_repeated_sequence(regex_list):
if regex_list[-1] != ")":
return [regex_list[-1]]
res = [")"]
counter = -1
for i in range(len(regex_list) - 2, -1, -1):
if regex_list[i] == "(":
counter += 1
res.append("(")
if counter == 0:
return res[::-1]
elif regex_list[i] == ")":
counter -= 1
res.append(")")
else:
res.append(regex_list[i])
return []

def _add_repetition(self, regex_list):
res = []
idx = 0
while idx < len(regex_list):
rep = self._is_repetition(regex_list, idx)
if rep is None:
res.append(regex_list[idx])
idx += 1
elif len(rep) == 2:
n_rep, end = rep
repeated = self._find_repeated_sequence(res)
for _ in range(n_rep - 1):
res.extend(repeated)
idx = end + 1
elif len(rep) == 3:
min_rep, max_rep, end = rep
repeated = self._find_repeated_sequence(res)
for _ in range(min_rep - 1):
res.extend(repeated)
for _ in range(min_rep, max_rep):
res.extend(repeated)
res.append("?")
idx = end + 1
return res

def _preprocess_optional(self):
regex_temp = []
for symbol in self._python_regex:
Expand Down
27 changes: 26 additions & 1 deletion pyformlang/regular_expression/tests/test_python_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ def test_shortcut_word(self):
def _test_compare(self, regex, s_test):
# Builds the pattern once with PythonRegex and once with the standard `re`
# module, then asserts that both give the same verdict on s_test.
r_pyformlang = PythonRegex(regex)
r_python = re.compile(regex)
# NOTE(review): the two assertions below look like the before/after pair of a
# diff hunk (match replaced by fullmatch) — confirm that only the fullmatch
# line is live in the actual file.
self.assertEqual(r_python.match(s_test) is not None, r_pyformlang.accepts(s_test))
# re.fullmatch succeeds only when the whole of s_test matches, whereas
# re.match above also succeeds on a matching prefix.
self.assertEqual(r_python.fullmatch(s_test) is not None, r_pyformlang.accepts(s_test))

def test_backslash(self):
self._test_compare(".*", "]")
Expand Down Expand Up @@ -283,3 +283,28 @@ def test_dot_harder(self):
self._test_compare(r"\.", ".")
self._test_compare(r"\\\.", "\\a")
self._test_compare(r"\\\.", "\\.")

def test_single_repetition(self):
    """Compare PythonRegex with ``re`` on ``{m}`` quantifiers.

    Includes an unterminated brace (``a{5b``), which is passed through
    the same comparison as the well-formed patterns.
    """
    cases = (
        (r"\d{3}-\d{3}-\d{4}", "012-876-3789"),
        (r"a{5}b", "ab"),
        (r"a{5}b", "aaaaab"),
        (r"a{5b", "aaaaab"),
        (r"a{5b", "a{5b"),
        (r"T{4}P{3}", "TTTTTTPPPPPPPPPPPP"),
    )
    for pattern, word in cases:
        self._test_compare(pattern, word)

def test_range_repetition(self):
    """Compare PythonRegex with ``re`` on ``{n,m}`` quantifiers.

    Each pattern is checked against several words of different lengths,
    and a malformed three-field brace (``{2,5,7}``) is included.
    """
    words_by_pattern = (
        (r"a{2,5}b", ("ab", "aab", "aaaaab", "aaaaaab")),
        (r"a{2,5,7}b", ("aaaaab", "a{2,5,7}b")),
        (r"ab{2,5}", ("ab", "abbb", "abbbbb", "abbbbbbbbb")),
        (r"[a-z]{1,3}", ("", "d", "do", "dpo", "dpoz")),
    )
    for pattern, words in words_by_pattern:
        for word in words:
            self._test_compare(pattern, word)

1 comment on commit 61ac29b

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
FileStmtsMissCoverMissing
pyformlang
   __init__.py10100% 
pyformlang/cfg
   __init__.py70100% 
   cfg.py5500100% 
   cfg_object.py100100% 
   cyk_table.py720100% 
   epsilon.py60100% 
   llone_parser.py15622 99%
   parse_tree.py6211 98%
   pda_object_creator.py310100% 
   production.py330100% 
   recursive_decent_parser.py5611 98%
   set_queue.py140100% 
   terminal.py150100% 
   utils.py100100% 
   utils_cfg.py250100% 
   variable.py260100% 
pyformlang/cfg/tests
   __init__.py00100% 
   test_cfg.py59811 99%
   test_llone_parser.py11811 99%
   test_production.py220100% 
   test_recursive_decent_parser.py250100% 
   test_terminal.py190100% 
   test_variable.py160100% 
pyformlang/fcfg
   __init__.py40100% 
   fcfg.py11711 99%
   feature_production.py260100% 
   feature_structure.py19133 98%
   state.py350100% 
pyformlang/fcfg/tests
   __init__.py00100% 
   test_fcfg.py1230100% 
   test_feature_structure.py1590100% 
pyformlang/finite_automaton
   __init__.py100100% 
   deterministic_finite_automaton.py20722 99%
   doubly_linked_list.py340100% 
   doubly_linked_node.py150100% 
   epsilon.py100100% 
   epsilon_nfa.py3720100% 
   finite_automaton.py1620100% 
   finite_automaton_object.py100100% 
   hopcroft_processing_list.py220100% 
   nondeterministic_finite_automaton.py220100% 
   nondeterministic_transition_function.py500100% 
   partition.py360100% 
   regexable.py160100% 
   state.py150100% 
   symbol.py110100% 
   transition_function.py5111 98%
pyformlang/finite_automaton/tests
   __init__.py00100% 
   test_deterministic_finite_automaton.py2610100% 
   test_epsilon.py100100% 
   test_epsilon_nfa.py6210100% 
   test_nondeterministic_finite_automaton.py930100% 
   test_nondeterministic_transition_function.py610100% 
   test_state.py280100% 
   test_symbol.py270100% 
   test_transition_function.py600100% 
pyformlang/fst
   __init__.py20100% 
   fst.py2420100% 
pyformlang/fst/tests
   __init__.py00100% 
   test_fst.py1600100% 
pyformlang/indexed_grammar
   __init__.py70100% 
   consumption_rule.py340100% 
   duplication_rule.py300100% 
   end_rule.py300100% 
   indexed_grammar.py25722 99%
   production_rule.py320100% 
   reduced_rule.py250100% 
   rule_ordering.py700100% 
   rules.py690100% 
pyformlang/indexed_grammar/tests
   __init__.py00100% 
   test_indexed_grammar.py2250100% 
   test_rules.py360100% 
pyformlang/pda
   __init__.py60100% 
   cfg_variable_converter.py6744 94%
   epsilon.py40100% 
   pda.py3090100% 
   stack_symbol.py160100% 
   state.py180100% 
   symbol.py140100% 
   transition_function.py460100% 
   utils.py360100% 
pyformlang/pda/tests
   __init__.py00100% 
   test_pda.py2460100% 
pyformlang/regular_expression
   __init__.py40100% 
   python_regex.py25866 98%
   regex.py1430100% 
   regex_objects.py790100% 
   regex_reader.py16044 98%
pyformlang/regular_expression/tests
   __init__.py00100% 
   test_python_regex.py25422 99%
   test_regex.py2490100% 
pyformlang/rsa
   __init__.py30100% 
   box.py3866 84%
   recursive_automaton.py8766 93%
pyformlang/rsa/tests
   __init__.py00100% 
   test_rsa.py510100% 
pyformlang/tests
   __init__.py00100% 
TOTAL80384399% 

Tests Skipped Failures Errors Time
270 0 💤 0 ❌ 0 🔥 3.598s ⏱️

Please sign in to comment.