Acceptance Test: conversion prompts first run
antoninoLorenzo committed Jul 3, 2024
1 parent 087d112 commit 948412c
Showing 3 changed files with 23 additions and 8 deletions.
Binary file modified test/tests/__pycache__/test_prompts.cpython-311-pytest-8.2.2.pyc
test/tests/results/conversion_times.json (2 changes: 1 addition & 1 deletion)
@@ -1 +1 @@
{"llama3": {"times": [37.51699495315552, 19.364502429962158, 48.94050359725952], "mean": 35.2740003267924}}
{"llama3": {"times": [60.0516414642334, 19.22053861618042, 48.96596574783325], "mean": 42.74604860941569}, "phi3": {"times": [50.286476850509644, 14.586498260498047, 65.26603984832764], "mean": 43.37967165311178}, "gemma:7b": {"times": [115.44812870025635, 38.25345754623413, 118.4050064086914], "mean": 90.7021975517273}}
test/tests/test_prompts.py (29 changes: 22 additions & 7 deletions)
@@ -12,19 +12,34 @@
 
 
 class TestPrompts(unittest.TestCase):
-    MODELS = ['llama3']
+    """
+    Conversion:
+    - llama3 : ok
+    - gemma2b : failing
+    - gemma7b : ok
+    - phi3 : ok
+    - qwen4b : failing (a lot)
+    """
+    MODELS = ['llama3', 'phi3', 'gemma:7b']
     GEMINI_KEY = os.getenv('GEMINI_API_KEY')
 
-    @unittest.skip('')
+    # @unittest.skip('')
    def test_conversion(self):
        """Tests the conversion from natural language plan produced
-        by the llm to tasks, so tests the efficiency of the prompt."""
+        by the llm to tasks, so tests the efficiency of the prompt.
+        TODO:
+          + edge case: no commands in natural language plan
+          + add test cases in conversion.json
+        """
        with open('test_cases/conversion.json', 'r', encoding='utf-8') as fp:
            test_cases = json.load(fp)
 
        inference_times = {model: {'times': [], 'mean': 0} for model in self.MODELS}
        for model in self.MODELS:
            agent = Agent(model=model)
+            # TODO:
+            # should make a first query to load LLM into Ollama,
+            # otherwise, conversion times are biased by loading time
            for test_case in test_cases:
                plan_nl = test_case['content']
                expected_commands = test_case['commands']
Expand All @@ -33,18 +48,18 @@ def test_conversion(self):
plan = agent.extract_plan(plan_nl)
t = time.time() - start

self.assertIsNotNone(plan, "Plan is None:")
self.assertIsNotNone(plan, f"[{model}] Plan is None:")
commands = [task.command for task in plan.tasks]
self.assertEquals(
len(commands),
len(expected_commands),
f"Found {len(commands)} commands, expected {len(expected_commands)}\n"
f"[{model}] Found {len(commands)} commands, expected {len(expected_commands)}\n"
f"Commands:\n{commands}\nExpected:\n{expected_commands}"
)
self.assertEquals(
commands,
expected_commands,
f"Commands:\n{commands}\nExpected:\n{expected_commands}"
f"[{model}] Commands:\n{commands}\nExpected:\n{expected_commands}"
)

inference_times[model]['times'].append(t)
@@ -55,7 +70,7 @@
            inference_times[model]['mean'] = mean_time
            json.dump(inference_times, fp)
 
-    # @unittest.skip('')
+    @unittest.skip('')
    def test_planning(self):
        """Tests the instruction following capability of the llm"""
        self.assertIsNotNone(self.GEMINI_KEY, 'Missing Gemini API Key')
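Note on the TODO added inside the model loop: the first call to each model also pays Ollama's model-loading cost, which inflates the recorded conversion times. A minimal sketch of the warm-up idea, not part of this commit and assuming only the Agent(model=...) and extract_plan(...) interface visible in the diff above:

    import time

    from src.agent import Agent  # hypothetical import path; the diff only shows the Agent name

    def timed_conversion(agent: Agent, plan_nl: str) -> float:
        """Return the time spent converting a single natural-language plan."""
        start = time.time()
        agent.extract_plan(plan_nl)
        return time.time() - start

    # Warm-up: a throwaway query forces Ollama to load the model weights,
    # so the timed calls below measure inference only, not loading time.
    agent = Agent(model='llama3')
    agent.extract_plan('warm-up')  # result is discarded
    print(timed_conversion(agent, 'example plan text taken from conversion.json'))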
