Acceptance Test: conversion prompts first run
antoninoLorenzo committed Jul 3, 2024
1 parent 087d112 commit 948412c
Showing 3 changed files with 23 additions and 8 deletions.
Binary file modified test/tests/__pycache__/test_prompts.cpython-311-pytest-8.2.2.pyc
test/tests/results/conversion_times.json (2 changes: 1 addition & 1 deletion)
@@ -1 +1 @@
{"llama3": {"times": [37.51699495315552, 19.364502429962158, 48.94050359725952], "mean": 35.2740003267924}}
{"llama3": {"times": [60.0516414642334, 19.22053861618042, 48.96596574783325], "mean": 42.74604860941569}, "phi3": {"times": [50.286476850509644, 14.586498260498047, 65.26603984832764], "mean": 43.37967165311178}, "gemma:7b": {"times": [115.44812870025635, 38.25345754623413, 118.4050064086914], "mean": 90.7021975517273}}
test/tests/test_prompts.py (29 changes: 22 additions & 7 deletions)
@@ -12,19 +12,34 @@
 
 
 class TestPrompts(unittest.TestCase):
-    MODELS = ['llama3']
+    """
+    Conversion:
+    - llama3 : ok
+    - gemma2b : failing
+    - gemma7b : ok
+    - phi3 : ok
+    - qwen4b : failing (a lot)
+    """
+    MODELS = ['llama3', 'phi3', 'gemma:7b']
     GEMINI_KEY = os.getenv('GEMINI_API_KEY')
 
-    @unittest.skip('')
+    # @unittest.skip('')
    def test_conversion(self):
        """Tests the conversion from natural language plan produced
-        by the llm to tasks, so tests the efficiency of the prompt."""
+        by the llm to tasks, so tests the efficiency of the prompt.
+        TODO:
+          + edge case: no commands in natural language plan
+          + add test cases in conversion.json
+        """
        with open('test_cases/conversion.json', 'r', encoding='utf-8') as fp:
            test_cases = json.load(fp)
 
        inference_times = {model: {'times': [], 'mean': 0} for model in self.MODELS}
        for model in self.MODELS:
            agent = Agent(model=model)
+            # TODO:
+            # should make a first query to load LLM into Ollama,
+            # otherwise, conversion times are biased by loading time
            for test_case in test_cases:
                plan_nl = test_case['content']
                expected_commands = test_case['commands']
Expand All @@ -33,18 +48,18 @@ def test_conversion(self):
plan = agent.extract_plan(plan_nl)
t = time.time() - start

self.assertIsNotNone(plan, "Plan is None:")
self.assertIsNotNone(plan, f"[{model}] Plan is None:")
commands = [task.command for task in plan.tasks]
self.assertEquals(
len(commands),
len(expected_commands),
f"Found {len(commands)} commands, expected {len(expected_commands)}\n"
f"[{model}] Found {len(commands)} commands, expected {len(expected_commands)}\n"
f"Commands:\n{commands}\nExpected:\n{expected_commands}"
)
self.assertEquals(
commands,
expected_commands,
f"Commands:\n{commands}\nExpected:\n{expected_commands}"
f"[{model}] Commands:\n{commands}\nExpected:\n{expected_commands}"
)

inference_times[model]['times'].append(t)
@@ -55,7 +70,7 @@
            inference_times[model]['mean'] = mean_time
            json.dump(inference_times, fp)
 
-    # @unittest.skip('')
+    @unittest.skip('')
    def test_planning(self):
        """Tests the instruction following capability of the llm"""
        self.assertIsNotNone(self.GEMINI_KEY, 'Missing Gemini API Key')
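Note on the TODO added inside the model loop: the first call to each model also pays Ollama's model-loading cost, which inflates the recorded conversion times. A minimal sketch of the warm-up idea, not part of this commit and assuming only the Agent(model=...) and extract_plan(...) interface visible in the diff above:

    import time

    from src.agent import Agent  # hypothetical import path; the diff only shows the Agent name

    def timed_conversion(agent: Agent, plan_nl: str) -> float:
        """Return the time spent converting a single natural-language plan."""
        start = time.time()
        agent.extract_plan(plan_nl)
        return time.time() - start

    # Warm-up: a throwaway query forces Ollama to load the model weights,
    # so the timed calls below measure inference only, not loading time.
    agent = Agent(model='llama3')
    agent.extract_plan('warm-up')  # result is discarded
    print(timed_conversion(agent, 'example plan text taken from conversion.json'))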
