
Commit 93ac087

Update RAG Evaluation prompts + re-compute
antoninoLorenzo committed Jul 16, 2024
1 parent 7b6f651 commit 93ac087
Showing 9 changed files with 191 additions and 70 deletions.
107 changes: 106 additions & 1 deletion data/rag_eval/owasp_50.json

Large diffs are not rendered by default.

Binary file modified data/rag_eval/results/plots/context_precision.png
Binary file modified data/rag_eval/results/plots/context_recall.png
4 changes: 2 additions & 2 deletions data/rag_eval/results/results.json
@@ -4,7 +4,7 @@
"context_recall": 0
},
{
-    "context_precision": 0.4337735369385255,
-    "context_recall": 0.42125943462841486
+    "context_precision": 0.968,
+    "context_recall": 0.9359999999999999
}
]
Binary file modified test/benchmarks/rag/__pycache__/metrics.cpython-311.pyc
Binary file not shown.
23 changes: 13 additions & 10 deletions test/benchmarks/rag/evaluation.py
@@ -133,17 +133,18 @@ def evaluate(vdb: Store, qa_paths: list, endpoint: str,

# Setup evaluation metrics
llm = LLM(model='gemma2:9b', client_url=endpoint)
- ctx_recall = ContextRecall(
- EVAL_PROMPTS[evaluation_model]['context_recall']['sys'],
- EVAL_PROMPTS[evaluation_model]['context_recall']['usr'],
- llm
- )
ctx_precision = ContextPrecision(
EVAL_PROMPTS[evaluation_model]['context_precision']['sys'],
EVAL_PROMPTS[evaluation_model]['context_precision']['usr'],
llm
)

+ ctx_recall = ContextRecall(
+ EVAL_PROMPTS[evaluation_model]['context_recall']['sys'],
+ EVAL_PROMPTS[evaluation_model]['context_recall']['usr'],
+ llm
+ )
+
# Run
recall = []
for i, item in tqdm(eval_dataset.iterrows(), total=len(eval_dataset), desc='Measuring Context Recall'):
@@ -158,10 +159,11 @@
ans = item.answer
precision.append(ctx_precision.compute(qst, ans, ctx))

- return pd.DataFrame({
+ metrics = pd.DataFrame({
'context_recall': recall,
'context_precision': precision
})
+ return metrics, eval_dataset


def update_evaluation_plots(results_df: pd.DataFrame):
@@ -223,14 +225,15 @@ def plot_eval(plot_df: pd.DataFrame, name: str):
'../../../data/rag_eval/owasp_50.json',
]

- eval_results_df = evaluate(
+ metrics_df, eval_output_dataset = evaluate(
vdb=knowledge_base,
qa_paths=synthetic_qa_paths,
endpoint=OLLAMA_ENDPOINT
)
- print(eval_results_df.head())
- eval_results_df.to_json('./tmp.json')
+ print(metrics_df.head())
+ metrics_df.to_json('./tmp_metrics.json')
+ eval_output_dataset.to_json('./tmp_eval_ds.json')

# eval_results_df = pd.read_json('./tmp.json')

- update_evaluation_plots(eval_results_df)
+ update_evaluation_plots(metrics_df)
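
Not part of the commit: a minimal sketch of how the two files written by the updated __main__ block could be inspected afterwards. It assumes only pandas and the './tmp_metrics.json' / './tmp_eval_ds.json' paths used above; since evaluate() walks the dataset in order, row i of the metrics frame scores row i of the evaluated dataset.

import pandas as pd

# Per-item scores and the evaluated Q/A dataset dumped by the run above (sketch).
metrics_df = pd.read_json('./tmp_metrics.json')
eval_ds = pd.read_json('./tmp_eval_ds.json')

# A positional join gives a per-question report for error analysis.
report = pd.concat(
    [eval_ds.reset_index(drop=True), metrics_df.reset_index(drop=True)],
    axis=1
)
print(report.head())
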
125 changes: 68 additions & 57 deletions test/benchmarks/rag/metrics.py
@@ -11,69 +11,79 @@

from src.agent.llm import LLM, Ollama

- # TODO:
- # rating could be done categorically instead of numerically
- # ex. great = 1; good = 0.7; inaccurate = 0.3; bad = 0

EVAL_PROMPTS = {
'gemma2:9b': {
'context_recall': {
- 'sys': textwrap.dedent("""
- Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only "Yes" (1) or "No" (0) as a binary classification.
- Your output should contain a list of 0 or 1 for each sentence, also it should be a JSON string as follows:
- {{"result": [1, 0, ...]}}
- IMPORTANT:
- - Only provide the JSON string in the specified format. Do not include any additional text.
- - If the answer mentions that available information wasn't sufficient, your response should be the following: {{"result": [0]}}
- """),
- 'usr': textwrap.dedent("""
- Answer:
- {answer}
- Context:
- {context}
- Your output should contain a list of 0 or 1 for each sentence, also it should be a JSON string as follows:
- {{"result": [1, 0, ...]}}
- IMPORTANT:
- - Only provide the JSON string in the specified format. Do not include any additional text.
- - If the answer mentions that available information wasn't sufficient, your response should be the following: {{"result": [0]}}
- """)
+ 'sys': """
+ Given a context and an answer, you should analyze the answer. Then ensure the answer information is confirmed by the context information.
+ Evaluate the overall accuracy of the answer based on the given context. Use the following categorical scoring system for your classification:
+ - "great" (1): The answer is highly relevant and clearly supported by the context.
+ - "good" (0.7): The answer is relevant and somewhat supported by the context.
+ - "inaccurate" (0.3): The answer is not clearly supported by the context but is not entirely irrelevant.
+ - "bad" (0): The answer contains information that contradicts the context, indicating a hallucination.
+ Your output should contain a single categorical score for the overall answer, formatted as a JSON string as follows:
+ {{"result": "great" | "good" | "inaccurate" | "bad"}}
+ Evaluation Guidelines:
+ - Only provide the JSON string in the specified format. Do not include any additional text.
+ - If the answer mentions that available information wasn't sufficient, your response should be the following: {{"result": "bad"}}
+ - Ensure your assessment is based on how well the overall answer aligns with the given context.""",
+ 'usr': """Answer:
+ {answer}
+ Context:
+ {context}
+ Your output should contain a single categorical score for the overall answer, formatted as a JSON string as follows:
+ {{"result": "great" | "good" | "inaccurate" | "bad"}}
+ IMPORTANT:
+ - Remember to follow the "Evaluation Guidelines"
+ - Provide only the JSON string, do not provide any explanation."""
},
'context_precision': {
- 'sys': textwrap.dedent("""
- Given question, answer and context verify if the context was useful in arriving at the given answer.
- Use only "Useful" (1) or "Not Useful" (0) as a binary classification.
- Your output should contain a list of 0 or 1 for each sentence, also it should be a JSON string as follows:
- {{"result": [1, 0, ...]}}
- IMPORTANT:
- - Only provide the JSON string in the specified format. Do not include explanations or any additional text.
- - If the answer do not provide a response to the question or mentions that available information wasn't sufficient, your response should be the following: {{"result": [0]}}
- """),
- 'usr': textwrap.dedent("""
- Question:
- {question}
- Context:
- {context}
- Answer:
- {answer}
- Your output should contain a list of 0 or 1 for each sentence, also it should be a JSON string as follows:
- {{"result": [1, 0, ...]}}
- IMPORTANT:
- - Only provide the JSON string in the specified format. Do not include explanations or any additional text.
- - If the answer do not provide a response to the question or mentions that available information wasn't sufficient, your response should be the following: {{"result": [0]}}
- """)
+ 'sys': """
+ Given a question, answer, and context, evaluate if the context was useful in arriving at the given answer. Use the following categorical scoring system for your classification:
+ - "great" (1): The context was highly useful and directly supported the answer.
+ - "good" (0.7): The context was useful and somewhat supported the answer.
+ - "inaccurate" (0.3): The context was only slightly useful and not directly supporting the answer.
+ - "bad" (0): The context was not useful or irrelevant in arriving at the answer.
+ Your output should contain a single categorical score for the overall answer, formatted as a JSON string as follows:
+ {{"result": "great" | "good" | "inaccurate" | "bad"}}
+ Evaluation Guidelines:
+ - Only provide the JSON string in the specified format. Do not include any additional text.
+ - If the answer does not respond to the question or mentions that available information wasn't sufficient, your response should be the following: {{"result": "not useful"}}
+ - Ensure your assessment is based on how well the context was useful in arriving at the answer.""",
+ 'usr': """Question:
+ {question}
+ Context:
+ {context}
+ Answer:
+ {answer}
+ Your output should contain a single categorical score for the overall answer, formatted as a JSON string as follows:
+ {{"result": "great" | "good" | "inaccurate" | "bad"}}
+ IMPORTANT:
+ - Remember to follow the "Evaluation Guidelines"
+ - Provide only the JSON string, do not provide any explanation."""
}
}
}

+ METRICS_VALUES = {
+ 'great': 1,
+ 'good': 0.7,
+ 'inaccurate': 0.3,
+ 'bad': 0
+ }

JSON_PATTERN = r'{"result": \[[^\]]*\]}'


@@ -94,11 +104,12 @@ def extract_response(response):
"""Extracts the json results from response"""
try:
# TODO: validate response response type
- return np.mean(json.loads(response)['result'])
+ label = json.loads(response)['result']
+ return METRICS_VALUES[label] if label in METRICS_VALUES else 0
except JSONDecodeError:
match = re.search(JSON_PATTERN, response)
if match:
- return np.mean(json.loads(match.group())['result'])
+ return float(json.loads(match.group())['result'])
else:
return response

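To make the new scoring scheme concrete, here is a small self-contained sketch (not taken from the repository) of how a categorical judgement returned by the evaluator LLM maps to a numeric score. METRICS_VALUES mirrors the mapping added above, while parse_judgement and LABEL_PATTERN are hypothetical stand-ins for extract_response and a string-valued result pattern (the committed JSON_PATTERN still matches the old list format).

import json
import re
from json import JSONDecodeError

# Mirrors the METRICS_VALUES mapping introduced in metrics.py.
METRICS_VALUES = {'great': 1, 'good': 0.7, 'inaccurate': 0.3, 'bad': 0}

# Hypothetical pattern for the new string-valued result.
LABEL_PATTERN = r'\{"result":\s*"(?P<label>[^"]+)"\}'

def parse_judgement(response: str) -> float:
    """Map an LLM judgement such as '{"result": "good"}' to a numeric score."""
    try:
        label = json.loads(response)['result']
    except (JSONDecodeError, KeyError, TypeError):
        # Fall back to scanning free-form output for the JSON fragment.
        match = re.search(LABEL_PATTERN, response)
        label = match.group('label') if match else None
    # Unknown or missing labels score 0, as in the updated extract_response.
    return METRICS_VALUES.get(label, 0)

print(parse_judgement('{"result": "good"}'))             # 0.7
print(parse_judgement('Verdict: {"result": "great"}'))   # 1
print(parse_judgement('no parsable judgement'))          # 0
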
1 change: 1 addition & 0 deletions test/benchmarks/rag/tmp_eval_ds.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions test/benchmarks/rag/tmp_metrics.json
@@ -0,0 +1 @@
{"context_recall":{"0":1.0,"1":1.0,"2":1.0,"3":0.7,"4":1.0,"5":1.0,"6":1.0,"7":1.0,"8":1.0,"9":1.0,"10":1.0,"11":1.0,"12":1.0,"13":1.0,"14":0.7,"15":1.0,"16":1.0,"17":1.0,"18":1.0,"19":1.0,"20":0.7,"21":1.0,"22":1.0,"23":0.7,"24":1.0,"25":1.0,"26":1.0,"27":1.0,"28":0.7,"29":1.0,"30":1.0,"31":1.0,"32":1.0,"33":1.0,"34":1.0,"35":1.0,"36":1.0,"37":1.0,"38":1.0,"39":1.0,"40":0.7,"41":1.0,"42":1.0,"43":1.0,"44":1.0,"45":0.3,"46":1.0,"47":1.0,"48":1.0,"49":0.3},"context_precision":{"0":1.0,"1":1.0,"2":1.0,"3":0.3,"4":1.0,"5":1.0,"6":1.0,"7":1.0,"8":1.0,"9":1.0,"10":0.7,"11":1.0,"12":1.0,"13":1.0,"14":1.0,"15":1.0,"16":1.0,"17":1.0,"18":1.0,"19":1.0,"20":1.0,"21":1.0,"22":1.0,"23":1.0,"24":1.0,"25":1.0,"26":1.0,"27":1.0,"28":0.7,"29":1.0,"30":1.0,"31":1.0,"32":1.0,"33":1.0,"34":1.0,"35":1.0,"36":1.0,"37":1.0,"38":1.0,"39":1.0,"40":1.0,"41":1.0,"42":1.0,"43":1.0,"44":1.0,"45":0.7,"46":1.0,"47":1.0,"48":1.0,"49":1.0}}
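
As a quick sanity check (not part of the commit), averaging these per-item scores reproduces the aggregates stored in data/rag_eval/results/results.json above:

import pandas as pd

# Path relative to the repository root (assumption); this is the file shown above.
per_item = pd.read_json('test/benchmarks/rag/tmp_metrics.json')

# Mean over the 50 items: context_precision = 0.968, context_recall = 0.936,
# matching the re-computed values committed to results.json.
print(per_item.mean())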
