
Commit 93ac087

Update RAG Evaluation prompts + re-compute
antoninoLorenzo committed Jul 16, 2024
1 parent 7b6f651 commit 93ac087
Showing 9 changed files with 191 additions and 70 deletions.
107 changes: 106 additions & 1 deletion data/rag_eval/owasp_50.json

Large diffs are not rendered by default.

Binary file modified data/rag_eval/results/plots/context_precision.png
Binary file modified data/rag_eval/results/plots/context_recall.png
4 changes: 2 additions & 2 deletions data/rag_eval/results/results.json
@@ -4,7 +4,7 @@
"context_recall": 0
},
{
-    "context_precision": 0.4337735369385255,
-    "context_recall": 0.42125943462841486
+    "context_precision": 0.968,
+    "context_recall": 0.9359999999999999
}
]
Binary file modified test/benchmarks/rag/__pycache__/metrics.cpython-311.pyc
Binary file not shown.
23 changes: 13 additions & 10 deletions test/benchmarks/rag/evaluation.py
@@ -133,17 +133,18 @@ def evaluate(vdb: Store, qa_paths: list, endpoint: str,

# Setup evaluation metrics
llm = LLM(model='gemma2:9b', client_url=endpoint)
- ctx_recall = ContextRecall(
- EVAL_PROMPTS[evaluation_model]['context_recall']['sys'],
- EVAL_PROMPTS[evaluation_model]['context_recall']['usr'],
- llm
- )
ctx_precision = ContextPrecision(
EVAL_PROMPTS[evaluation_model]['context_precision']['sys'],
EVAL_PROMPTS[evaluation_model]['context_precision']['usr'],
llm
)

+ ctx_recall = ContextRecall(
+ EVAL_PROMPTS[evaluation_model]['context_recall']['sys'],
+ EVAL_PROMPTS[evaluation_model]['context_recall']['usr'],
+ llm
+ )
+
# Run
recall = []
for i, item in tqdm(eval_dataset.iterrows(), total=len(eval_dataset), desc='Measuring Context Recall'):
@@ -158,10 +159,11 @@
ans = item.answer
precision.append(ctx_precision.compute(qst, ans, ctx))

- return pd.DataFrame({
+ metrics = pd.DataFrame({
'context_recall': recall,
'context_precision': precision
})
+ return metrics, eval_dataset


def update_evaluation_plots(results_df: pd.DataFrame):
@@ -223,14 +225,15 @@ def plot_eval(plot_df: pd.DataFrame, name: str):
'../../../data/rag_eval/owasp_50.json',
]

- eval_results_df = evaluate(
+ metrics_df, eval_output_dataset = evaluate(
vdb=knowledge_base,
qa_paths=synthetic_qa_paths,
endpoint=OLLAMA_ENDPOINT
)
- print(eval_results_df.head())
- eval_results_df.to_json('./tmp.json')
+ print(metrics_df.head())
+ metrics_df.to_json('./tmp_metrics.json')
+ eval_output_dataset.to_json('./tmp_eval_ds.json')

# eval_results_df = pd.read_json('./tmp.json')

- update_evaluation_plots(eval_results_df)
+ update_evaluation_plots(metrics_df)
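
Not part of the commit: a minimal sketch of how the two files written by the updated __main__ block could be inspected afterwards. It assumes only pandas and the './tmp_metrics.json' / './tmp_eval_ds.json' paths used above; since evaluate() walks the dataset in order, row i of the metrics frame scores row i of the evaluated dataset.

import pandas as pd

# Per-item scores and the evaluated Q/A dataset dumped by the run above (sketch).
metrics_df = pd.read_json('./tmp_metrics.json')
eval_ds = pd.read_json('./tmp_eval_ds.json')

# A positional join gives a per-question report for error analysis.
report = pd.concat(
    [eval_ds.reset_index(drop=True), metrics_df.reset_index(drop=True)],
    axis=1
)
print(report.head())
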
125 changes: 68 additions & 57 deletions test/benchmarks/rag/metrics.py
@@ -11,69 +11,79 @@

from src.agent.llm import LLM, Ollama

- # TODO:
- # rating could be done categorically instead of numerically
- # ex. great = 1; good = 0.7; inaccurate = 0.3; bad = 0

EVAL_PROMPTS = {
'gemma2:9b': {
'context_recall': {
- 'sys': textwrap.dedent("""
- Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only "Yes" (1) or "No" (0) as a binary classification.
- Your output should contain a list of 0 or 1 for each sentence, also it should be a JSON string as follows:
- {{"result": [1, 0, ...]}}
- IMPORTANT:
- - Only provide the JSON string in the specified format. Do not include any additional text.
- - If the answer mentions that available information wasn't sufficient, your response should be the following: {{"result": [0]}}
- """),
- 'usr': textwrap.dedent("""
- Answer:
- {answer}
- Context:
- {context}
- Your output should contain a list of 0 or 1 for each sentence, also it should be a JSON string as follows:
- {{"result": [1, 0, ...]}}
- IMPORTANT:
- - Only provide the JSON string in the specified format. Do not include any additional text.
- - If the answer mentions that available information wasn't sufficient, your response should be the following: {{"result": [0]}}
- """)
+ 'sys': """
+ Given a context and an answer, you should analyze the answer. Then ensure the answer information is confirmed by the context information.
+ Evaluate the overall accuracy of the answer based on the given context. Use the following categorical scoring system for your classification:
+ - "great" (1): The answer is highly relevant and clearly supported by the context.
+ - "good" (0.7): The answer is relevant and somewhat supported by the context.
+ - "inaccurate" (0.3): The answer is not clearly supported by the context but is not entirely irrelevant.
+ - "bad" (0): The answer contains information that contradicts the context, indicating a hallucination.
+ Your output should contain a single categorical score for the overall answer, formatted as a JSON string as follows:
+ {{"result": "great" | "good" | "inaccurate" | "bad"}}
+ Evaluation Guidelines:
+ - Only provide the JSON string in the specified format. Do not include any additional text.
+ - If the answer mentions that available information wasn't sufficient, your response should be the following: {{"result": "bad"}}
+ - Ensure your assessment is based on how well the overall answer aligns with the given context.""",
+ 'usr': """Answer:
+ {answer}
+ Context:
+ {context}
+ Your output should contain a single categorical score for the overall answer, formatted as a JSON string as follows:
+ {{"result": "great" | "good" | "inaccurate" | "bad"}}
+ IMPORTANT:
+ - Remember to follow the "Evaluation Guidelines"
+ - Provide only the JSON string, do not provide any explanation."""
},
'context_precision': {
- 'sys': textwrap.dedent("""
- Given question, answer and context verify if the context was useful in arriving at the given answer.
- Use only "Useful" (1) or "Not Useful" (0) as a binary classification.
- Your output should contain a list of 0 or 1 for each sentence, also it should be a JSON string as follows:
- {{"result": [1, 0, ...]}}
- IMPORTANT:
- - Only provide the JSON string in the specified format. Do not include explanations or any additional text.
- - If the answer do not provide a response to the question or mentions that available information wasn't sufficient, your response should be the following: {{"result": [0]}}
- """),
- 'usr': textwrap.dedent("""
- Question:
- {question}
- Context:
- {context}
- Answer:
- {answer}
- Your output should contain a list of 0 or 1 for each sentence, also it should be a JSON string as follows:
- {{"result": [1, 0, ...]}}
- IMPORTANT:
- - Only provide the JSON string in the specified format. Do not include explanations or any additional text.
- - If the answer do not provide a response to the question or mentions that available information wasn't sufficient, your response should be the following: {{"result": [0]}}
- """)
+ 'sys': """
+ Given a question, answer, and context, evaluate if the context was useful in arriving at the given answer. Use the following categorical scoring system for your classification:
+ - "great" (1): The context was highly useful and directly supported the answer.
+ - "good" (0.7): The context was useful and somewhat supported the answer.
+ - "inaccurate" (0.3): The context was only slightly useful and not directly supporting the answer.
+ - "bad" (0): The context was not useful or irrelevant in arriving at the answer.
+ Your output should contain a single categorical score for the overall answer, formatted as a JSON string as follows:
+ {{"result": "great" | "good" | "inaccurate" | "bad"}}
+ Evaluation Guidelines:
+ - Only provide the JSON string in the specified format. Do not include any additional text.
+ - If the answer does not respond to the question or mentions that available information wasn't sufficient, your response should be the following: {{"result": "not useful"}}
+ - Ensure your assessment is based on how well the context was useful in arriving at the answer.""",
+ 'usr': """Question:
+ {question}
+ Context:
+ {context}
+ Answer:
+ {answer}
+ Your output should contain a single categorical score for the overall answer, formatted as a JSON string as follows:
+ {{"result": "great" | "good" | "inaccurate" | "bad"}}
+ IMPORTANT:
+ - Remember to follow the "Evaluation Guidelines"
+ - Provide only the JSON string, do not provide any explanation."""
}
}
}

+ METRICS_VALUES = {
+ 'great': 1,
+ 'good': 0.7,
+ 'inaccurate': 0.3,
+ 'bad': 0
+ }

JSON_PATTERN = r'{"result": \[[^\]]*\]}'


@@ -94,11 +104,12 @@ def extract_response(response):
"""Extracts the json results from response"""
try:
# TODO: validate response response type
- return np.mean(json.loads(response)['result'])
+ label = json.loads(response)['result']
+ return METRICS_VALUES[label] if label in METRICS_VALUES else 0
except JSONDecodeError:
match = re.search(JSON_PATTERN, response)
if match:
- return np.mean(json.loads(match.group())['result'])
+ return float(json.loads(match.group())['result'])
else:
return response

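To make the new scoring scheme concrete, here is a small self-contained sketch (not taken from the repository) of how a categorical judgement returned by the evaluator LLM maps to a numeric score. METRICS_VALUES mirrors the mapping added above, while parse_judgement and LABEL_PATTERN are hypothetical stand-ins for extract_response and a string-valued result pattern (the committed JSON_PATTERN still matches the old list format).

import json
import re
from json import JSONDecodeError

# Mirrors the METRICS_VALUES mapping introduced in metrics.py.
METRICS_VALUES = {'great': 1, 'good': 0.7, 'inaccurate': 0.3, 'bad': 0}

# Hypothetical pattern for the new string-valued result.
LABEL_PATTERN = r'\{"result":\s*"(?P<label>[^"]+)"\}'

def parse_judgement(response: str) -> float:
    """Map an LLM judgement such as '{"result": "good"}' to a numeric score."""
    try:
        label = json.loads(response)['result']
    except (JSONDecodeError, KeyError, TypeError):
        # Fall back to scanning free-form output for the JSON fragment.
        match = re.search(LABEL_PATTERN, response)
        label = match.group('label') if match else None
    # Unknown or missing labels score 0, as in the updated extract_response.
    return METRICS_VALUES.get(label, 0)

print(parse_judgement('{"result": "good"}'))             # 0.7
print(parse_judgement('Verdict: {"result": "great"}'))   # 1
print(parse_judgement('no parsable judgement'))          # 0
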
1 change: 1 addition & 0 deletions test/benchmarks/rag/tmp_eval_ds.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions test/benchmarks/rag/tmp_metrics.json
@@ -0,0 +1 @@
{"context_recall":{"0":1.0,"1":1.0,"2":1.0,"3":0.7,"4":1.0,"5":1.0,"6":1.0,"7":1.0,"8":1.0,"9":1.0,"10":1.0,"11":1.0,"12":1.0,"13":1.0,"14":0.7,"15":1.0,"16":1.0,"17":1.0,"18":1.0,"19":1.0,"20":0.7,"21":1.0,"22":1.0,"23":0.7,"24":1.0,"25":1.0,"26":1.0,"27":1.0,"28":0.7,"29":1.0,"30":1.0,"31":1.0,"32":1.0,"33":1.0,"34":1.0,"35":1.0,"36":1.0,"37":1.0,"38":1.0,"39":1.0,"40":0.7,"41":1.0,"42":1.0,"43":1.0,"44":1.0,"45":0.3,"46":1.0,"47":1.0,"48":1.0,"49":0.3},"context_precision":{"0":1.0,"1":1.0,"2":1.0,"3":0.3,"4":1.0,"5":1.0,"6":1.0,"7":1.0,"8":1.0,"9":1.0,"10":0.7,"11":1.0,"12":1.0,"13":1.0,"14":1.0,"15":1.0,"16":1.0,"17":1.0,"18":1.0,"19":1.0,"20":1.0,"21":1.0,"22":1.0,"23":1.0,"24":1.0,"25":1.0,"26":1.0,"27":1.0,"28":0.7,"29":1.0,"30":1.0,"31":1.0,"32":1.0,"33":1.0,"34":1.0,"35":1.0,"36":1.0,"37":1.0,"38":1.0,"39":1.0,"40":1.0,"41":1.0,"42":1.0,"43":1.0,"44":1.0,"45":0.7,"46":1.0,"47":1.0,"48":1.0,"49":1.0}}
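
As a quick sanity check (not part of the commit), averaging these per-item scores reproduces the aggregates stored in data/rag_eval/results/results.json above:

import pandas as pd

# Path relative to the repository root (assumption); this is the file shown above.
per_item = pd.read_json('test/benchmarks/rag/tmp_metrics.json')

# Mean over the 50 items: context_precision = 0.968, context_recall = 0.936,
# matching the re-computed values committed to results.json.
print(per_item.mean())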
