TestPlan -> TestPrompts

antoninoLorenzo · Jul 3, 2024 · 04de5c7 · 04de5c7
1 parent 9ce40d0
commit 04de5c7
Show file tree

Hide file tree

Showing 4 changed files with 54 additions and 43 deletions.
diff --git a/test/tests/__pycache__/test_prompts.cpython-311-pytest-8.2.2.pyc b/test/tests/__pycache__/test_prompts.cpython-311-pytest-8.2.2.pyc
diff --git a/test/tests/results/conversion_times.json b/test/tests/results/conversion_times.json
@@ -0,0 +1 @@
+{"llama3": {"times": [37.51699495315552, 19.364502429962158, 48.94050359725952], "mean": 35.2740003267924}}
diff --git a/test/tests/test_cases/conversion.json b/test/tests/test_cases/conversion.json
@@ -43,11 +43,11 @@
             "SQL Injection Testing:\n\n",
             "We can use sqlmap to test for SQL injection vulnerabilities in the login.php page. Here's a plan:\n\n",
             "1. Initial Scan: Run sqlmap with the -T option to identify potential SQL injection points.\n",
-            "Command: sqlmap -T \"login.php\" http://10.10.62.242\n",
+            "Command: sqlmap -T login.php http://10.10.62.242\n",
             "2. Fingerprinting: Use sqlmap's fingerprinting feature to gather more information about the database and its schema.\n",
             "Command: sqlmap -F http://10.10.62.242/login.php\n",
             "3. Injection Testing: Run sqlmap with the -b option to test for SQL injection vulnerabilities.\n",
-            "Command: sqlmap -b \"login.php\" http://10.10.62.242\n\n",
+            "Command: sqlmap -b login.php http://10.10.62.242\n\n",
             "Accessing .htpasswd:\n\n",
             "Since the .htpasswd file is protected, we can try using Nmap's -script option to run a script that can help us access it.\n\n",
             "1. HTPasswd Script: Run Nmap with the htpasswd script to see if it can help us access the file.\n",
@@ -62,9 +62,9 @@
             "(Note: As always, I'll ensure that the necessary information is provided, and we stay within the scope of authorized penetration testing.)\n"
             ],
         "commands": [
-            "sqlmap -T \"login.php\" http://10.10.62.242",
+            "sqlmap -T login.php http://10.10.62.242",
             "sqlmap -F http://10.10.62.242/login.php",
-            "sqlmap -b \"login.php\" http://10.10.62.242",
+            "sqlmap -b login.php http://10.10.62.242",
             "nmap -script htpasswd http://10.10.62.242",
             "nmap -script http http://10.10.62.242"
         ]

diff --git a/test/tests/test_prompts.py b/test/tests/test_prompts.py
@@ -1,54 +1,64 @@
+import os
 import json
+import time
 import unittest
 
-from src.agent import Agent
-from src.agent.tools import Terminal
-from src.agent.plan import Plan, Task, TaskStatus
+import numpy as np
+from dotenv import load_dotenv
 
+from src.agent import Agent
 
-class TestPlan(unittest.TestCase):
+load_dotenv()
 
-    # def test_execute(self):
-    #     tasks = [
-    #         Task(thought="Get directory content", tool=Terminal, command="ls"),
-    #         Task(thought="Get machine host name", tool=Terminal, command="hostname")
-    #     ]
 
-    #     plan = Plan(tasks)
-    #     for output in plan.execute():
-    #         print('---------------------------------')
-    #         for i, task_overview in enumerate(output):
-    #             print(f'{i+1}. {task_overview}')
-    #             if task_overview.status == TaskStatus.DONE:
-    #                 print(f'Output:\n{task_overview.output}')
+class TestPrompts(unittest.TestCase):
+    MODELS = ['llama3']
+    GEMINI_KEY = os.getenv('GEMINI_API_KEY')
 
-    def test_from_response(self):
+    @unittest.skip('')
+    def test_conversion(self):
         """Tests the conversion from natural language plan produced
         by the llm to tasks, so tests the efficiency of the prompt."""
-        agent = Agent(model='llama3')
-        with open('plan_tests.json', 'r', encoding='utf-8') as fp:
+        with open('test_cases/conversion.json', 'r', encoding='utf-8') as fp:
             test_cases = json.load(fp)
 
-        for test_case in test_cases:
-            plan_nl = test_case['content']
-            expected_commands = test_case['commands']
-
-            plan = agent.extract_plan(plan_nl)
-            self.assertIsNotNone(plan, "Plan is None:")
-
-            commands = [task.command for task in plan.tasks]
-
-            self.assertEquals(
-                len(commands),
-                len(expected_commands),
-                f"Found {len(commands)} commands, expected {len(expected_commands)}\n"
-                f"Commands:\n{commands}\nExpected:\n{expected_commands}"
-            )
-            self.assertEquals(
-                commands,
-                expected_commands,
-                f"Commands:\n{commands}\nExpected:\n{expected_commands}"
-            )
+        inference_times = {model: {'times': [], 'mean': 0} for model in self.MODELS}
+        for model in self.MODELS:
+            agent = Agent(model=model)
+            for test_case in test_cases:
+                plan_nl = test_case['content']
+                expected_commands = test_case['commands']
+
+                start = time.perf_counter()  # monotonic clock; only extract_plan is timed
+                plan = agent.extract_plan(plan_nl)
+                t = time.perf_counter() - start
+
+                self.assertIsNotNone(plan, "Plan is None:")
+                commands = [task.command for task in plan.tasks]
+                self.assertEqual(  # assertEquals alias was removed in Python 3.12
+                    len(commands),
+                    len(expected_commands),
+                    f"Found {len(commands)} commands, expected {len(expected_commands)}\n"
+                    f"Commands:\n{commands}\nExpected:\n{expected_commands}"
+                )
+                self.assertEqual(
+                    commands,
+                    expected_commands,
+                    f"Commands:\n{commands}\nExpected:\n{expected_commands}"
+                )
+
+                inference_times[model]['times'].append(t)  # reached only if the assertions above passed
+
+        with open('results/conversion_times.json', 'w+', encoding='utf-8') as fp:
+            for model in self.MODELS:
+                mean_time = np.array(inference_times[model]['times']).mean()
+                inference_times[model]['mean'] = mean_time
+            json.dump(inference_times, fp)
+
+    # @unittest.skip('')
+    def test_planning(self):
+        """Tests the LLM's instruction following; currently only checks the Gemini key is set."""
+        self.assertIsNotNone(self.GEMINI_KEY, 'Missing Gemini API Key')
 
 
 if __name__ == "__main__":