diff --git a/data/rag_eval/results/plots/context_precision.png b/data/rag_eval/results/plots/context_precision.png
deleted file mode 100644
index e1843d0..0000000
Binary files a/data/rag_eval/results/plots/context_precision.png and /dev/null differ
diff --git a/data/rag_eval/results/plots/context_recall.png b/data/rag_eval/results/plots/context_recall.png
deleted file mode 100644
index f1765be..0000000
Binary files a/data/rag_eval/results/plots/context_recall.png and /dev/null differ
diff --git a/data/rag_eval/results/plots/context_relevancy.png b/data/rag_eval/results/plots/context_relevancy.png
deleted file mode 100644
index fbed320..0000000
Binary files a/data/rag_eval/results/plots/context_relevancy.png and /dev/null differ
diff --git a/data/rag_eval/results/plots/plot.png b/data/rag_eval/results/plots/plot.png
new file mode 100644
index 0000000..7388276
Binary files /dev/null and b/data/rag_eval/results/plots/plot.png differ
diff --git a/data/rag_eval/results/results.json b/data/rag_eval/results/results.json
index d0c1d48..54abe26 100644
--- a/data/rag_eval/results/results.json
+++ b/data/rag_eval/results/results.json
@@ -4,11 +4,6 @@
         "context_recall": 0,
         "context_relevancy": 0
     },
-    {
-        "context_precision": 0.9819999999999999,
-        "context_recall": 0.9400000000000002,
-        "context_relevancy": 0
-    },
     {
         "context_precision": 0.9819999999999999,
         "context_recall": 0.9400000000000002,
diff --git a/test/benchmarks/rag/evaluation.py b/test/benchmarks/rag/evaluation.py
index 6b1d1c5..062cbe6 100644
--- a/test/benchmarks/rag/evaluation.py
+++ b/test/benchmarks/rag/evaluation.py
@@ -185,7 +185,10 @@ def evaluate(vdb: Store, qa_paths: list, endpoint: str, metrics: list,
     return metrics, eval_dataset
 
 
-def update_evaluation_plots(results_df: pd.DataFrame, metrics: list, modified=True):
+def update_evaluation_plots(results_df: pd.DataFrame, metrics: list,
+                            modified=True,
+                            rows: int = 1,
+                            cols: int = 3):
     if len(metrics) == 0:
         raise ValueError('No metrics specified.')
 
@@ -206,8 +209,9 @@ def update_evaluation_plots(results_df: pd.DataFrame, metrics: list, modified=Tr
 
             # Add new results
             res: pd.Series = results_df.mean()
-            new_results = {metric_name: res[metric_name] if metric_name in res else content[len(content) - 1][metric_name]
-                           for metric_name in content[0].keys()}
+            new_results = {
+                metric_name: res[metric_name] if metric_name in res else content[len(content) - 1][metric_name]
+                for metric_name in content[0].keys()}
             content.append(new_results)
 
             fp.seek(0)
@@ -216,33 +220,44 @@ def update_evaluation_plots(results_df: pd.DataFrame, metrics: list, modified=Tr
     else:
         history = results_df
 
-    def plot_eval(plot_df: pd.DataFrame, name: str):
+    # Ensure the grid has enough space for all metrics
+    total_metrics = len(metrics)
+    if rows * cols < total_metrics:
+        raise ValueError(f'Grid size ({rows}x{cols}) is too small for {total_metrics} metrics.')
+
+    # Create a single plot with subplots for each metric
+    fig, axes = plt.subplots(rows, cols, figsize=(cols * 5, rows * 5))
+    axes = axes.flatten()  # Flatten in case of a single row or column
+
+    def plot_eval(ax, plot_df: pd.DataFrame, name: str):
         """Create a plot for an evaluation metric, the columns should be named 'x' and 'y'"""
-        sns.lineplot(data=plot_df, x='x', y='y', zorder=0)
-        plt.scatter(
+        sns.lineplot(data=plot_df, x='x', y='y', ax=ax, zorder=0)
+        ax.scatter(
             plot_df.iloc[1:]['x'],
             plot_df.iloc[1:]['y'],
             color='#000000',
             s=15,
             zorder=1
         )
-
-        plt.ylim(0, 1)
-        plt.xticks(range(0, len(plot_df)))
-
-        plt.title(f'RAG Evaluation: {name}')
-        plt.ylabel(name)
-        plt.xlabel('')
-        return plt
+        ax.set_ylim(0, 1)
+        ax.set_xticks(range(0, len(plot_df)))
+        ax.set_title(f'RAG Evaluation: {name}')
+        ax.set_ylabel(name)
+        ax.set_xlabel('')
 
     # Output the updated evaluation plots
-    plots = {}
-    for col in history.columns:
+    for i, col in enumerate(history.columns):
         values = history[col].to_list()
-        plots[col] = [{'x': i, 'y': val} for i, val in enumerate(values)]
-        metric_plot_df = pd.DataFrame(plots[col])
-        plot = plot_eval(metric_plot_df, col)
-        plot.savefig(f'../../../data/rag_eval/results/plots/{col}.png')
+        metric_plot_df = pd.DataFrame([{'x': i, 'y': val} for i, val in enumerate(values)])
+        plot_eval(axes[i], metric_plot_df, col)
+
+    # Hide any unused subplots
+    for j in range(i + 1, len(axes)):
+        fig.delaxes(axes[j])
+
+    plt.tight_layout()
+    plt.savefig(f'../../../data/rag_eval/results/plots/plot.png')
+    plt.close()
 
 
 def main(plot_only=False):
@@ -279,4 +294,4 @@ def main(plot_only=False):
 
 
 if __name__ == '__main__':
-    main(plot_only=False)
+    main(plot_only=True)
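
For reference, a minimal sketch of how the reworked update_evaluation_plots could be driven after this change. The metric names, the scores, and the import path are illustrative assumptions, not part of the patch:

    # Usage sketch for the reworked update_evaluation_plots (not part of the
    # diff). Assumptions: the metric names below match what evaluate()
    # produces, and the module is importable as `evaluation`.
    import pandas as pd

    from evaluation import update_evaluation_plots  # hypothetical import path

    # One row of freshly computed scores.
    results_df = pd.DataFrame([{
        'context_precision': 0.982,
        'context_recall': 0.940,
        'context_relevancy': 0.0,
    }])

    metrics = ['context_precision', 'context_recall', 'context_relevancy']

    # rows * cols must be >= len(metrics) or the new guard raises ValueError;
    # the 1x3 default fits exactly three metrics.
    update_evaluation_plots(results_df, metrics, modified=False, rows=1, cols=3)

One edge case reviewers may want to flag: matplotlib's plt.subplots(1, 1) returns a single Axes object rather than an array, so axes.flatten() would raise AttributeError for a 1x1 grid; passing squeeze=False to plt.subplots (or wrapping the result with np.atleast_1d) would make the flatten safe.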