diff --git a/test/benchmarks/rag/dataset_generation.ipynb b/test/benchmarks/rag/dataset_generation.ipynb
index 0e87f0a..856cabd 100644
--- a/test/benchmarks/rag/dataset_generation.ipynb
+++ b/test/benchmarks/rag/dataset_generation.ipynb
@@ -59,24 +59,40 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 23,
    "id": "initial_id",
    "metadata": {
     "collapsed": true,
     "ExecuteTime": {
-     "end_time": "2024-06-18T10:36:25.913447Z",
-     "start_time": "2024-06-18T10:36:25.025936Z"
+     "end_time": "2024-07-16T08:20:48.261772Z",
+     "start_time": "2024-07-16T08:20:48.251272Z"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "True"
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
+    "import os\n",
     "import json\n",
+    "import time\n",
     "import textwrap\n",
+    "import random\n",
     "from json import JSONDecodeError\n",
     "\n",
-    "import ollama\n",
     "import pandas as pd\n",
-    "from tqdm import tqdm"
+    "import google.generativeai as genai\n",
+    "from google.generativeai.types import HarmCategory, HarmBlockThreshold\n",
+    "from tqdm import tqdm\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "load_dotenv()"
    ]
   },
   {
@@ -102,27 +118,22 @@
    "outputs": [],
    "source": [
     "# Question Generation Prompts\n",
-    "gen_query_sys = textwrap.dedent(\"\"\"\n",
+    "GEN_QUERY_PROMPT = textwrap.dedent(\"\"\"\n",
     "    As a question-generating assistant specializing in cybersecurity, your task is to generate simple, domain-specific questions based on the information given in a provided document, with a focus on Penetration Testing. \n",
     "    \n",
     "    You will be provided with the text of a document, surrounded by input tags. Please read the document, extract relevant information, and generate a simple, clear question based on the content. The question should be a maximum of two sentences long.\n",
     "    \n",
     "    Your response should be in the following JSON format:\n",
-    "    {\"QUESTION\": \"Your question here.\"}\n",
-    "\"\"\")\n",
-    "\n",
-    "gen_query_pr = textwrap.dedent(\"\"\"\n",
-    "    {document}\n",
-    "    \n",
-    "    Your response should be in the following JSON format:\n",
     "    {{\"QUESTION\": \"Your question here.\"}}\n",
+    "    \n",
+    "    {document}\n",
     "\"\"\")"
    ],
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-06-18T10:36:25.918935Z",
-     "start_time": "2024-06-18T10:36:25.914950Z"
+     "end_time": "2024-07-16T08:12:50.813210Z",
+     "start_time": "2024-07-16T08:12:50.809213Z"
     }
    },
    "id": "909ffcf7d895f882",
@@ -133,37 +144,30 @@
    "outputs": [],
    "source": [
     "# Question Answering (Ground Truth) Prompts\n",
-    "gen_answer_sys = textwrap.dedent(\"\"\"\n",
+    "GEN_ANSWER_PROMPT = textwrap.dedent(\"\"\"\n",
     "    As an answer-generating assistant specializing in cybersecurity, your task is to provide accurate answers for given questions in the context of Penetration Testing. You will be provided with a question and contextual information to generate a precise and relevant answer.\n",
     "    \n",
     "    Your answer should be in the following JSON format:\n",
-    "    {\"ANSWER\": \"Your answer here.\"}\n",
+    "    {{\"ANSWER\": \"Your answer here.\"}}\n",
     "    \n",
     "    Take a deep breath and work on this problem step by step.\n",
-    "\"\"\")\n",
-    "\n",
-    "gen_answer_pr = textwrap.dedent(\"\"\"\n",
-    "    Given the following question and context, provide an answer in the specified JSON format. The answer should address the question directly, without mentioning the context itself. If the context does not provide relevant information to answer the question, you may write “NOT FOUND” in the answer.\n",
     "    \n",
     "    Query:\n",
     "    {query}\n",
     "    \n",
     "    Context:\n",
     "    {context}\n",
-    "    \n",
-    "    Your response should be in the following JSON format:\n",
-    "    {{\"ANSWER\": \"Your answer here.\"}}\n",
     "\"\"\")"
    ],
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-06-18T10:36:25.925443Z",
-     "start_time": "2024-06-18T10:36:25.919940Z"
+     "end_time": "2024-07-16T08:17:10.954805Z",
+     "start_time": "2024-07-16T08:17:10.950804Z"
     }
    },
    "id": "b038c4b3396e7573",
-   "execution_count": 3
+   "execution_count": 19
   },
   {
    "cell_type": "markdown",
@@ -192,8 +196,8 @@
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-06-18T10:36:29.411533Z",
-     "start_time": "2024-06-18T10:36:25.926435Z"
+     "end_time": "2024-07-16T08:13:03.410324Z",
+     "start_time": "2024-07-16T08:12:56.840434Z"
     }
    },
    "id": "28d4062e8f104809",
@@ -220,8 +224,8 @@
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-06-18T10:36:29.429527Z",
-     "start_time": "2024-06-18T10:36:29.414030Z"
+     "end_time": "2024-07-16T08:13:15.420193Z",
+     "start_time": "2024-07-16T08:13:15.371693Z"
     }
    },
    "id": "46b9a52d84842463",
@@ -229,16 +233,7 @@
   },
   {
    "cell_type": "code",
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "D:\\Desktop\\prog\\Projects\\AI-OPS\\src\\agent\\knowledge\\chunker.py:18: UserWarning: [W008] Evaluating Span.similarity based on empty vectors.\n",
-      "  sim = sentences[i-1].similarity(sentences[i])\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "chunks = []\n",
     "for idx, item in owasp_df.iterrows():\n",
@@ -247,8 +242,8 @@
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-06-18T10:36:30.512028Z",
-     "start_time": "2024-06-18T10:36:29.430530Z"
+     "end_time": "2024-07-16T08:13:20.887814Z",
+     "start_time": "2024-07-16T08:13:19.777309Z"
     }
    },
    "id": "c83fa403738f990a",
@@ -272,8 +267,8 @@
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-06-18T10:36:30.517030Z",
-     "start_time": "2024-06-18T10:36:30.513045Z"
+     "end_time": "2024-07-16T08:13:23.687412Z",
+     "start_time": "2024-07-16T08:13:23.682913Z"
     }
    },
    "id": "d7e3ba9fb757051e",
@@ -301,18 +296,68 @@
    "cell_type": "code",
    "outputs": [],
    "source": [
-    "import random"
+    "# Gemini Setup\n",
+    "GEMINI_KEY = os.getenv('GEMINI_API_KEY')\n",
+    "genai.configure(api_key=GEMINI_KEY)\n",
+    "\n",
+    "llm = genai.GenerativeModel(\n",
+    "    'gemini-1.5-flash',\n",
+    "    generation_config={\"response_mime_type\": \"application/json\"}\n",
+    ")\n",
+    "\n",
+    "safety_settings = {\n",
+    "    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE\n",
+    "}"
    ],
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-06-18T10:36:30.521528Z",
-     "start_time": "2024-06-18T10:36:30.518028Z"
+     "end_time": "2024-07-16T08:13:29.764286Z",
+     "start_time": "2024-07-16T08:13:29.759304Z"
     }
    },
-   "id": "d009b37def9b2214",
+   "id": "38da027da6290bdb",
    "execution_count": 8
   },
+  {
+   "cell_type": "code",
+   "outputs": [],
+   "source": [
+    "def gen_data(_chunks):\n",
+    "    # Generate Question\n",
+    "    gen_query = llm.generate_content(\n",
+    "        GEN_QUERY_PROMPT.format(document=_chunks), \n",
+    "        safety_settings=safety_settings\n",
+    "    )\n",
+    "\n",
+    "    try:\n",
+    "        question = json.loads(gen_query.text)['QUESTION']\n",
+    "    except JSONDecodeError:\n",
+    "        question = gen_query.text\n",
+    "\n",
+    "    # Generate Ground Truth\n",
+    "    gen_answer = llm.generate_content(\n",
+    "
GEN_ANSWER_PROMPT.format(query=question, context=_chunks), \n", + " safety_settings=safety_settings\n", + " )\n", + " \n", + " try: \n", + " answer = json.loads(gen_answer.text)['ANSWER']\n", + " except JSONDecodeError:\n", + " answer = gen_answer.text\n", + " \n", + " return question, answer" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-07-16T08:27:41.501482Z", + "start_time": "2024-07-16T08:27:41.496482Z" + } + }, + "id": "bcf432f3f144dcfc", + "execution_count": 25 + }, { "cell_type": "code", "outputs": [ @@ -320,28 +365,35 @@ "name": "stderr", "output_type": "stream", "text": [ - "Generating q&a: 100%|██████████| 100/100 [22:26<00:00, 13.46s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "JSON Decode Errors: 20\n" + "Generating q&a: 64%|██████▍ | 32/50 [03:44<02:06, 7.02s/it]\n" ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" + "ename": "ResourceExhausted", + "evalue": "429 Resource has been exhausted (e.g. check quota).", + "output_type": "error", + "traceback": [ + "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[1;31mResourceExhausted\u001B[0m Traceback (most recent call last)", + "Cell \u001B[1;32mIn[27], line 15\u001B[0m\n\u001B[0;32m 13\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m:\n\u001B[0;32m 14\u001B[0m time\u001B[38;5;241m.\u001B[39msleep(\u001B[38;5;241m20\u001B[39m)\n\u001B[1;32m---> 15\u001B[0m q, a \u001B[38;5;241m=\u001B[39m \u001B[43mgen_data\u001B[49m\u001B[43m(\u001B[49m\u001B[43mchosen_chunks\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 17\u001B[0m i \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m=\u001B[39m \u001B[38;5;241m1\u001B[39m\n\u001B[0;32m 18\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m i \u001B[38;5;241m==\u001B[39m \u001B[38;5;241m0\u001B[39m:\n", + "Cell \u001B[1;32mIn[25], line 3\u001B[0m, in \u001B[0;36mgen_data\u001B[1;34m(_chunks)\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mgen_data\u001B[39m(_chunks):\n\u001B[0;32m 2\u001B[0m \u001B[38;5;66;03m# Generate Question\u001B[39;00m\n\u001B[1;32m----> 3\u001B[0m gen_query \u001B[38;5;241m=\u001B[39m \u001B[43mllm\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgenerate_content\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 4\u001B[0m \u001B[43m \u001B[49m\u001B[43mGEN_QUERY_PROMPT\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mformat\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdocument\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43m_chunks\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\n\u001B[0;32m 5\u001B[0m \u001B[43m \u001B[49m\u001B[43msafety_settings\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43msafety_settings\u001B[49m\n\u001B[0;32m 6\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 8\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m 9\u001B[0m question \u001B[38;5;241m=\u001B[39m json\u001B[38;5;241m.\u001B[39mloads(gen_query\u001B[38;5;241m.\u001B[39mtext)[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mQUESTION\u001B[39m\u001B[38;5;124m'\u001B[39m]\n", + "File \u001B[1;32mD:\\Desktop\\prog\\Projects\\AI-OPS\\.venv\\Lib\\site-packages\\google\\generativeai\\generative_models.py:331\u001B[0m, in \u001B[0;36mGenerativeModel.generate_content\u001B[1;34m(self, contents, generation_config, safety_settings, stream, tools, tool_config, request_options)\u001B[0m\n\u001B[0;32m 329\u001B[0m 
\u001B[38;5;28;01mreturn\u001B[39;00m generation_types\u001B[38;5;241m.\u001B[39mGenerateContentResponse\u001B[38;5;241m.\u001B[39mfrom_iterator(iterator)\n\u001B[0;32m 330\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m--> 331\u001B[0m response \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_client\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgenerate_content\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 332\u001B[0m \u001B[43m \u001B[49m\u001B[43mrequest\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 333\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mrequest_options\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 334\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 335\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m generation_types\u001B[38;5;241m.\u001B[39mGenerateContentResponse\u001B[38;5;241m.\u001B[39mfrom_response(response)\n\u001B[0;32m 336\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m google\u001B[38;5;241m.\u001B[39mapi_core\u001B[38;5;241m.\u001B[39mexceptions\u001B[38;5;241m.\u001B[39mInvalidArgument \u001B[38;5;28;01mas\u001B[39;00m e:\n", + "File \u001B[1;32mD:\\Desktop\\prog\\Projects\\AI-OPS\\.venv\\Lib\\site-packages\\google\\ai\\generativelanguage_v1beta\\services\\generative_service\\client.py:827\u001B[0m, in \u001B[0;36mGenerativeServiceClient.generate_content\u001B[1;34m(self, request, model, contents, retry, timeout, metadata)\u001B[0m\n\u001B[0;32m 824\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_validate_universe_domain()\n\u001B[0;32m 826\u001B[0m \u001B[38;5;66;03m# Send the request.\u001B[39;00m\n\u001B[1;32m--> 827\u001B[0m response \u001B[38;5;241m=\u001B[39m \u001B[43mrpc\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 828\u001B[0m \u001B[43m \u001B[49m\u001B[43mrequest\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 829\u001B[0m \u001B[43m \u001B[49m\u001B[43mretry\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mretry\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 830\u001B[0m \u001B[43m \u001B[49m\u001B[43mtimeout\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtimeout\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 831\u001B[0m \u001B[43m \u001B[49m\u001B[43mmetadata\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mmetadata\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 832\u001B[0m \u001B[43m\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 834\u001B[0m \u001B[38;5;66;03m# Done; return the response.\u001B[39;00m\n\u001B[0;32m 835\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m response\n", + "File \u001B[1;32mD:\\Desktop\\prog\\Projects\\AI-OPS\\.venv\\Lib\\site-packages\\google\\api_core\\gapic_v1\\method.py:131\u001B[0m, in \u001B[0;36m_GapicCallable.__call__\u001B[1;34m(self, timeout, retry, compression, *args, **kwargs)\u001B[0m\n\u001B[0;32m 128\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_compression \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[0;32m 129\u001B[0m kwargs[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcompression\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m=\u001B[39m compression\n\u001B[1;32m--> 131\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mwrapped_func\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m 
\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n", + "File \u001B[1;32mD:\\Desktop\\prog\\Projects\\AI-OPS\\.venv\\Lib\\site-packages\\google\\api_core\\retry\\retry_unary.py:293\u001B[0m, in \u001B[0;36mRetry.__call__..retry_wrapped_func\u001B[1;34m(*args, **kwargs)\u001B[0m\n\u001B[0;32m 289\u001B[0m target \u001B[38;5;241m=\u001B[39m functools\u001B[38;5;241m.\u001B[39mpartial(func, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[0;32m 290\u001B[0m sleep_generator \u001B[38;5;241m=\u001B[39m exponential_sleep_generator(\n\u001B[0;32m 291\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_initial, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_maximum, multiplier\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_multiplier\n\u001B[0;32m 292\u001B[0m )\n\u001B[1;32m--> 293\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mretry_target\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 294\u001B[0m \u001B[43m \u001B[49m\u001B[43mtarget\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 295\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_predicate\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 296\u001B[0m \u001B[43m \u001B[49m\u001B[43msleep_generator\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 297\u001B[0m \u001B[43m \u001B[49m\u001B[43mtimeout\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_timeout\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 298\u001B[0m \u001B[43m \u001B[49m\u001B[43mon_error\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mon_error\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 299\u001B[0m \u001B[43m\u001B[49m\u001B[43m)\u001B[49m\n", + "File \u001B[1;32mD:\\Desktop\\prog\\Projects\\AI-OPS\\.venv\\Lib\\site-packages\\google\\api_core\\retry\\retry_unary.py:153\u001B[0m, in \u001B[0;36mretry_target\u001B[1;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001B[0m\n\u001B[0;32m 149\u001B[0m \u001B[38;5;66;03m# pylint: disable=broad-except\u001B[39;00m\n\u001B[0;32m 150\u001B[0m \u001B[38;5;66;03m# This function explicitly must deal with broad exceptions.\u001B[39;00m\n\u001B[0;32m 151\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m exc:\n\u001B[0;32m 152\u001B[0m \u001B[38;5;66;03m# defer to shared logic for handling errors\u001B[39;00m\n\u001B[1;32m--> 153\u001B[0m \u001B[43m_retry_error_helper\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 154\u001B[0m \u001B[43m \u001B[49m\u001B[43mexc\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 155\u001B[0m \u001B[43m \u001B[49m\u001B[43mdeadline\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 156\u001B[0m \u001B[43m \u001B[49m\u001B[43msleep\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 157\u001B[0m \u001B[43m \u001B[49m\u001B[43merror_list\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 158\u001B[0m \u001B[43m \u001B[49m\u001B[43mpredicate\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 159\u001B[0m \u001B[43m \u001B[49m\u001B[43mon_error\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 160\u001B[0m \u001B[43m \u001B[49m\u001B[43mexception_factory\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 161\u001B[0m \u001B[43m \u001B[49m\u001B[43mtimeout\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 162\u001B[0m \u001B[43m 
\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 163\u001B[0m \u001B[38;5;66;03m# if exception not raised, sleep before next attempt\u001B[39;00m\n\u001B[0;32m 164\u001B[0m time\u001B[38;5;241m.\u001B[39msleep(sleep)\n", + "File \u001B[1;32mD:\\Desktop\\prog\\Projects\\AI-OPS\\.venv\\Lib\\site-packages\\google\\api_core\\retry\\retry_base.py:212\u001B[0m, in \u001B[0;36m_retry_error_helper\u001B[1;34m(exc, deadline, next_sleep, error_list, predicate_fn, on_error_fn, exc_factory_fn, original_timeout)\u001B[0m\n\u001B[0;32m 206\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m predicate_fn(exc):\n\u001B[0;32m 207\u001B[0m final_exc, source_exc \u001B[38;5;241m=\u001B[39m exc_factory_fn(\n\u001B[0;32m 208\u001B[0m error_list,\n\u001B[0;32m 209\u001B[0m RetryFailureReason\u001B[38;5;241m.\u001B[39mNON_RETRYABLE_ERROR,\n\u001B[0;32m 210\u001B[0m original_timeout,\n\u001B[0;32m 211\u001B[0m )\n\u001B[1;32m--> 212\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m final_exc \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01msource_exc\u001B[39;00m\n\u001B[0;32m 213\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m on_error_fn \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[0;32m 214\u001B[0m on_error_fn(exc)\n", + "File \u001B[1;32mD:\\Desktop\\prog\\Projects\\AI-OPS\\.venv\\Lib\\site-packages\\google\\api_core\\retry\\retry_unary.py:144\u001B[0m, in \u001B[0;36mretry_target\u001B[1;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001B[0m\n\u001B[0;32m 142\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m sleep \u001B[38;5;129;01min\u001B[39;00m sleep_generator:\n\u001B[0;32m 143\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m--> 144\u001B[0m result \u001B[38;5;241m=\u001B[39m \u001B[43mtarget\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 145\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m inspect\u001B[38;5;241m.\u001B[39misawaitable(result):\n\u001B[0;32m 146\u001B[0m warnings\u001B[38;5;241m.\u001B[39mwarn(_ASYNC_RETRY_WARNING)\n", + "File \u001B[1;32mD:\\Desktop\\prog\\Projects\\AI-OPS\\.venv\\Lib\\site-packages\\google\\api_core\\timeout.py:120\u001B[0m, in \u001B[0;36mTimeToDeadlineTimeout.__call__..func_with_timeout\u001B[1;34m(*args, **kwargs)\u001B[0m\n\u001B[0;32m 117\u001B[0m \u001B[38;5;66;03m# Avoid setting negative timeout\u001B[39;00m\n\u001B[0;32m 118\u001B[0m kwargs[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtimeout\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mmax\u001B[39m(\u001B[38;5;241m0\u001B[39m, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_timeout \u001B[38;5;241m-\u001B[39m time_since_first_attempt)\n\u001B[1;32m--> 120\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mfunc\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n", + "File \u001B[1;32mD:\\Desktop\\prog\\Projects\\AI-OPS\\.venv\\Lib\\site-packages\\google\\api_core\\grpc_helpers.py:78\u001B[0m, in \u001B[0;36m_wrap_unary_errors..error_remapped_callable\u001B[1;34m(*args, **kwargs)\u001B[0m\n\u001B[0;32m 76\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m callable_(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[0;32m 77\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m 
grpc\u001B[38;5;241m.\u001B[39mRpcError \u001B[38;5;28;01mas\u001B[39;00m exc:\n\u001B[1;32m---> 78\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m exceptions\u001B[38;5;241m.\u001B[39mfrom_grpc_error(exc) \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mexc\u001B[39;00m\n", + "\u001B[1;31mResourceExhausted\u001B[0m: 429 Resource has been exhausted (e.g. check quota)." ] } ], "source": [ - "dataset_size = 100\n", + "dataset_size = 50\n", "data = []\n", - "errors = 0\n", + "i = 3\n", "\n", "for _ in tqdm(range(dataset_size), total=dataset_size, desc='Generating q&a'):\n", " # Get random chunks\n", @@ -349,52 +401,57 @@ " n = random.choice([1, 2, 3])\n", " chosen_chunks = random.choice(chunks[start:start+n])\n", " \n", - " # Generate Question\n", - " gen_query = ollama.chat(\n", - " model='gemma:2b',\n", - " messages=[\n", - " {'role': 'system', 'content': gen_query_sys},\n", - " {'role': 'user', 'content': gen_query_pr.format(document=chosen_chunks)}\n", - " ]\n", - " )\n", - " \n", " try:\n", - " question = json.loads(gen_query['message']['content'])['QUESTION']\n", - " except JSONDecodeError:\n", - " errors += 1\n", - " question = gen_query['message']['content']\n", - "\n", - " # Generate Ground Truth\n", - " gen_answer = ollama.chat(\n", - " model='gemma:2b',\n", - " messages=[\n", - " {'role': 'system', 'content': gen_answer_sys},\n", - " {'role': 'user', 'content': gen_answer_pr.format(query=question, context=chosen_chunks)}\n", - " ]\n", - " )\n", - " \n", - " try: \n", - " answer = json.loads(gen_answer['message']['content'])['ANSWER']\n", - " except JSONDecodeError:\n", - " errors += 1\n", - " answer = gen_answer['message']['content']\n", + " q, a = gen_data(chosen_chunks)\n", + " except Exception:\n", + " time.sleep(20)\n", + " q, a = gen_data(chosen_chunks)\n", " \n", + " i -= 1\n", + " if i == 0:\n", + " i = 3\n", + " time.sleep(6)\n", + " \n", " data.append({\n", " 'context': chosen_chunks,\n", - " 'question': question,\n", - " 'ground_truth': answer\n", - " })\n", - "print(f'JSON Decode Errors: {errors}')" + " 'question': q,\n", + " 'ground_truth': a\n", + " })" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:58:56.648274Z", - "start_time": "2024-06-18T10:36:30.522529Z" + "end_time": "2024-07-16T08:33:39.266111Z", + "start_time": "2024-07-16T08:29:54.616901Z" } }, "id": "5cbb2eca96cca287", - "execution_count": 9 + "execution_count": 27 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "[{'context': 'Injection slides down to the third position. 94% of the applications were tested for some form of injection with a max incidence rate of 19%, an average incidence rate of 3%, and 274k occurrences.',\n 'question': 'While injection vulnerabilities are still prevalent, their occurrence has decreased, with only 19% of applications showing the highest incidence rate. What factors might have contributed to this decrease in injection vulnerabilities?',\n 'ground_truth': \"The decrease in injection vulnerabilities could be attributed to several factors:\\n\\n* **Increased Awareness and Education:** The cybersecurity community has become more aware of injection vulnerabilities, leading to better understanding and practices among developers. Educational resources and training programs have helped raise awareness and promote secure coding practices.\\n* **Improved Development Tools and Frameworks:** Modern development tools and frameworks often incorporate security features that help mitigate injection vulnerabilities. 
These features can automatically detect and prevent certain types of injections, reducing the risk of errors.\\n* **Security Testing and Analysis Tools:** Advanced security testing tools and automated vulnerability scanners are now available to detect injection vulnerabilities. These tools are used during development and deployment to identify and fix issues before they become exploitable.\\n* **Static and Dynamic Code Analysis:** Code analysis tools can help detect potential injection vulnerabilities during the development process. Static analysis examines code without actually executing it, while dynamic analysis checks code during execution. These techniques can help identify and fix potential issues early on.\\n* **Security Best Practices and Standards:** Security best practices and standards, such as OWASP (Open Web Application Security Project), provide guidelines and recommendations for secure coding. Adhering to these standards helps mitigate injection vulnerabilities.\\n* **Increased Use of Secure Development Practices:** Secure development practices, such as input validation and output encoding, are increasingly being adopted by developers to reduce the risk of injection attacks.\\n\\nWhile injection vulnerabilities have decreased, it's important to note that they are still a significant threat. Attackers are constantly developing new techniques to exploit these vulnerabilities, so it's essential to stay vigilant and continue implementing robust security measures.\"},\n {'context': 'Notable Common Weakness Enumerations (CWEs) included are CWE-79: Cross-site Scripting, CWE-89: SQL Injection, and CWE-73: External Control of File Name or Path.\\n Description\\nAn application is vulnerable to attack when:\\n* User-supplied data is not validated, filtered, or sanitized by the application.\\n Dynamic queries or non-parameterized calls without context-aware escaping are used directly in the interpreter.\\n Hostile data is used within object-relational mapping (ORM) search parameters to extract additional, sensitive records.\\n',\n 'question': 'How can an attacker exploit CWE-79, CWE-89, and CWE-73 vulnerabilities to gain access to sensitive information if user-supplied data is not properly validated or sanitized?',\n 'ground_truth': 'If user-supplied data is not properly validated or sanitized, an attacker can exploit vulnerabilities like CWE-79, CWE-89, and CWE-73 to gain access to sensitive information. \\n\\n* **CWE-79 (Cross-site Scripting):** Attackers can inject malicious JavaScript code into the application, which could be executed by unsuspecting users. This code could steal cookies, session tokens, or other sensitive data. For example, if an attacker can inject a script into a comment field on a website, that script could run when another user views the comment, potentially stealing their login credentials.\\n\\n* **CWE-89 (SQL Injection):** Attackers can inject malicious SQL code into data submitted to the application, manipulating the database queries to gain access to sensitive information or modify database contents. For example, an attacker could inject a SQL query into a login form to bypass authentication and gain access to the database.\\n\\n* **CWE-73 (External Control of File Name or Path):** Attackers can manipulate the application to access or modify files outside of the intended scope. This could allow an attacker to read sensitive data, modify system files, or execute malicious code. 
For example, an attacker could provide a file path to an image upload function that points to a system configuration file, allowing them to access or modify sensitive data.'},\n {'context': 'Shifting up one position to #2, previously known as Sensitive Data Exposure, which is more of a broad symptom rather than a root cause, the focus is on failures related to cryptography (or lack thereof). Which often lead to exposure of sensitive data.',\n 'question': 'What are some common failures related to cryptography that can lead to the exposure of sensitive data?',\n 'ground_truth': \"Common failures related to cryptography that can lead to the exposure of sensitive data include:\\n\\n* **Weak or outdated encryption algorithms:** Using algorithms that are easily cracked or have known vulnerabilities, like outdated versions of DES or MD5.\\n* **Poor key management:** Improper generation, storage, or distribution of encryption keys, which can result in unauthorized access or compromise.\\n* **Insufficient key length:** Using keys that are too short, making it easier for attackers to brute-force them.\\n* **Improper implementation:** Incorrectly implementing encryption algorithms, leading to vulnerabilities that can be exploited.\\n* **Misuse of encryption:** Using encryption for purposes it's not intended for, such as encrypting data at rest without also encrypting it in transit.\\n* **Lack of encryption:** Not using encryption at all for sensitive data, leaving it vulnerable to eavesdropping or data breaches.\\n* **Cryptographic backdoors:** Introducing intentional weaknesses into encryption systems, potentially at the request of governments, making it easier to decrypt data.\\n* **Key escrow:** Storing copies of encryption keys in a central location, which can be compromised and expose the keys to attackers.\\n* **Side-channel attacks:** Exploiting physical characteristics of a cryptographic system, such as timing, power consumption, or electromagnetic emissions, to extract secret information.\\n\\nThese failures highlight the importance of carefully selecting, implementing, and managing cryptographic solutions to protect sensitive data effectively.\"},\n {'context': 'Without logging and monitoring, breaches cannot be detected. Insufficient logging, detection, monitoring, and active response occurs any time:\\n* Auditable events, such as logins, failed logins, and high-value transactions, are not logged.\\n * Warnings and errors generate no, inadequate, or unclear log messages.\\n * Logs of applications and APIs are not monitored for suspicious activity.\\n Logs are only stored locally.\\n',\n 'question': 'What are the potential security risks associated with not logging and monitoring auditable events such as logins, failed logins, and high-value transactions?',\n 'ground_truth': 'Failing to log and monitor auditable events such as logins, failed logins, and high-value transactions presents significant security risks. Without these logs, it becomes impossible to detect breaches, investigate incidents, or identify malicious activities. This lack of visibility makes it significantly easier for attackers to compromise systems, steal sensitive data, and remain undetected for extended periods. Additionally, without proper logging, it becomes challenging to demonstrate compliance with security regulations and standards, potentially leading to legal repercussions.'},\n {'context': \"* Don't store sensitive data unnecessarily. 
Discard it as soon as possible or use PCI DSS compliant tokenization or even truncation. Data that is not retained cannot be stolen.\\n * Make sure to encrypt all sensitive data at rest.\\n * Ensure up-to-date and strong standard algorithms, protocols, and keys are in place; use proper key management.\\n\",\n 'question': 'How can an organization ensure the security of sensitive data at rest during a penetration test?',\n 'ground_truth': '{\"ANSWER\": \"An organization can ensure the security of sensitive data at rest during a penetration test by following these best practices:\\\\n\\\\n1. **Minimize Data Retention:** Only store sensitive data if absolutely necessary and discard it as soon as possible. This principle aligns with the concept of \\\\\"least privilege\\\\\" and reduces the potential attack surface.\\n2. **Implement Strong Encryption:** Encrypt all sensitive data at rest using robust encryption algorithms and protocols. This ensures that even if an attacker gains access to the storage medium, they cannot decrypt the data.\\n3. **Utilize Tokenization and Truncation:** Consider using PCI DSS compliant tokenization or truncation methods to replace sensitive data with non-sensitive representations. This approach further reduces the risk of data breaches.\\n4. **Maintain Strong Key Management:** Implement proper key management practices to protect encryption keys. This includes using secure key storage, rotation, and access controls.\\n5. **Keep Algorithms and Protocols Updated:** Ensure that the encryption algorithms, protocols, and keys used are up-to-date and meet industry standards. Regular updates help mitigate vulnerabilities discovered in older versions.\\n6. **Employ Secure Storage Solutions:** Choose secure storage solutions that incorporate access controls, audit trails, and other security features.\\n7. **Implement Secure Configuration Management:** Ensure the secure configuration of storage systems and encryption software to minimize vulnerabilities. This includes proper patching and hardening practices.\\n8. **Conduct Regular Security Audits:** Perform regular security audits to identify and address any potential weaknesses in data protection measures.\"}\\n'},\n {'context': 'Are default crypto keys in use, weak crypto keys generated or re-used, or is proper key management or rotation missing? Are crypto keys checked into source code repositories?\\n* Is encryption not enforced, e.g., are any HTTP headers (browser) security directives or headers missing?\\n * Is the received server certificate and the trust chain properly validated?\\n* Are initialization vectors ignored, reused, or not generated sufficiently secure for the cryptographic mode of operation?',\n 'question': 'What measures are in place to ensure that encryption is properly enforced, including the presence of necessary HTTP headers and validation of the server certificate and trust chain?',\n 'ground_truth': \"The provided context focuses on assessing potential vulnerabilities related to encryption, specifically focusing on key management, HTTP header security, and server certificate validation. While the context lists potential issues, it doesn't directly address measures taken to ensure proper encryption enforcement. To comprehensively answer your question, we need more information about the specific system or application being assessed. For example, we would need to know:\\n\\n1. **The encryption algorithms and protocols used:** What type of encryption is being employed (e.g., TLS, HTTPS, AES)? 
Understanding the specific algorithms is crucial to assess their strength and appropriate usage.\\n2. **The system's configuration:** What are the specific settings related to encryption? This includes details about key management practices, key rotation schedules, and the implementation of HTTP headers. \\n3. **The trust chain validation process:** How are the server certificates and trust chains being validated? Are appropriate certificate authorities being used and validated? \\n4. **The security policies and procedures:** Are there documented policies and procedures for encryption management? These policies should address issues such as key generation, storage, and rotation. \\n\\nBy understanding these details, we can then evaluate the measures in place to ensure proper encryption enforcement and identify any potential weaknesses or gaps in security. For example, if the system relies on default or weak encryption algorithms, or if key management practices are inadequate, it could be vulnerable to attacks. Additionally, if HTTP security headers are missing or misconfigured, the communication could be intercepted or tampered with. Finally, inadequate validation of server certificates and trust chains could lead to man-in-the-middle attacks. \"},\n {'context': 'Ensure all login, access control, and server-side input validation failures can be logged with sufficient user context to identify suspicious or malicious accounts and held for enough time to allow delayed forensic analysis.\\n * Ensure that logs are generated in a format that log management solutions can easily consume.\\n * Ensure log data is encoded correctly to prevent injections or attacks on the logging or monitoring systems.\\n * Ensure high-value transactions have an audit trail with integrity controls to prevent tampering or deletion, such as append-only database tables or similar.\\n * DevSecOps teams should establish effective monitoring and alerting such that suspicious activities are detected and responded to quickly.\\n',\n 'question': 'How can login, access control, and server-side input validation failures be logged in a way that allows for easy identification of suspicious or malicious accounts and enables delayed forensic analysis?',\n 'ground_truth': 'To ensure effective logging for login, access control, and server-side input validation failures, several steps are crucial:\\n\\n1. **Detailed Logging:** Logs should capture comprehensive information about each failure, including:\\n * **User ID/Account:** Clearly identify the user involved in the failure.\\n * **Timestamp:** Record the precise time of the failure.\\n * **Failure Type:** Specify the exact nature of the failure (e.g., invalid username, incorrect password, SQL injection attempt).\\n * **Source IP Address:** Log the IP address from which the failed attempt originated.\\n * **User Agent:** Record the user agent string (browser or application) used.\\n * **Request Data:** Capture the input data associated with the failed attempt, like login credentials or form submissions.\\n\\n2. **Standardized Format:** Logs should be generated in a format easily digestible by log management solutions. Common formats include Syslog, JSON, or CSV. This enables centralized analysis and correlation of events across different systems.\\n\\n3. **Secure Encoding:** Log data should be encoded correctly to prevent injection attacks on the logging or monitoring systems. 
This might involve escaping special characters, using secure protocols, or implementing appropriate sanitization techniques.\\n\\n4. **Persistent Storage:** Logs should be retained for an extended period to allow for delayed forensic analysis. The retention period should be determined based on legal and regulatory requirements, as well as the nature of the data and the potential for future investigations. Consider using append-only databases or similar solutions to prevent tampering or deletion.\\n\\n5. **Integrity Controls:** High-value transactions, such as financial transactions or critical data modifications, should have an audit trail with integrity controls. This ensures that every action is recorded and that records cannot be easily modified or deleted.\\n\\n6. **Monitoring and Alerting:** Effective monitoring and alerting systems are essential to detect suspicious activities quickly. This involves setting up rules and thresholds for unusual events, such as sudden increases in login failures, failed access attempts, or input validation errors. Alerts should be sent to appropriate security teams for timely investigation and response.'},\n {'context': 'The breach was reportedly caused by payment application security vulnerabilities exploited by attackers, who harvested more than 400,000 customer payment records. The airline was fined 20 million pounds as a result by the privacy regulator.',\n 'question': 'What specific payment application security vulnerabilities were exploited in the breach, and how did they allow attackers to harvest customer payment records?',\n 'ground_truth': 'The provided context does not specify the particular payment application security vulnerabilities exploited in the breach. Therefore, I cannot provide an answer to your question. To answer this question, we would need additional information about the specific vulnerabilities exploited, such as whether they were related to insecure data storage, inadequate input validation, or lack of encryption. '},\n {'context': 'Threat modeling should be integrated into refinement sessions (or similar activities); look for changes in data flows and access control or other security controls. In the user story development determine the correct flow and failure states, ensure they are well understood and agreed upon by responsible and impacted parties. Analyze assumptions and conditions for expected and failure flows, ensure they are still accurate and desirable. Determine how to validate the assumptions and enforce conditions needed for proper behaviors. Ensure the results are documented in the user story.',\n 'question': 'During user story development, how can threat modeling be used to identify changes in data flows and access control, and what steps should be taken to ensure these changes are understood and agreed upon by the relevant stakeholders?',\n 'ground_truth': \"Threat modeling during user story development can be used to identify changes in data flows and access control by meticulously analyzing the story's narrative. The process involves identifying potential threats and vulnerabilities that may arise from these changes. This includes examining data flows, access permissions, and security controls. To ensure that these changes are understood and agreed upon by stakeholders, the following steps should be taken: 1. 
**Integrate threat modeling into user story refinement sessions**: This allows for a collaborative approach, bringing together developers, security experts, and stakeholders to discuss and address potential security implications. 2. **Determine the correct flow and failure states**: This involves mapping out the intended data flow and identifying potential failure points. Understanding these states helps in developing appropriate security measures. 3. **Ensure understanding and agreement**: The flow and failure states, along with any necessary security controls, should be clearly communicated to all stakeholders. This includes the development team, security team, and business users. 4. **Analyze assumptions and conditions**: The team should scrutinize assumptions and conditions related to data flows and security controls. Ensuring they are accurate and desirable helps prevent vulnerabilities. 5. **Validate assumptions and enforce conditions**: Validating assumptions and implementing mechanisms to enforce security conditions is crucial. This can involve code reviews, security testing, and automated monitoring. 6. **Document findings in the user story**: All findings and security requirements should be documented within the user story itself, providing a clear and comprehensive record for future reference. By implementing these steps, threat modeling can effectively identify and address changes in data flows and access control, ensuring the security of the application and its data.\"},\n {'context': 'Log all failures and alert administrators when credential stuffing, brute force, or other attacks are detected.\\n * Use a server-side, secure, built-in session manager that generates a new random session ID with high entropy after login. Session identifier should not be in the URL, be securely stored, and invalidated after logout, idle, and absolute timeouts.\\n Example Attack Scenarios\\nScenario #1: Credential stuffing, the use of lists of known passwords, is a common attack. Suppose an application does not implement automated threat or credential stuffing protection.',\n 'question': 'What are the potential consequences if an application does not implement automated threat or credential stuffing protection, and how can a server-side, secure, built-in session manager help mitigate these risks?',\n 'ground_truth': 'Without automated threat or credential stuffing protection, an application becomes vulnerable to unauthorized access. Attackers can use lists of stolen credentials (credential stuffing) to try logging in with multiple accounts, potentially compromising user accounts and sensitive data. A server-side, secure session manager with a robust session ID generation mechanism helps mitigate this risk by:\\n\\n* **Generating Strong Session IDs:** Randomly generated session IDs with high entropy make it much harder for attackers to guess or brute-force valid sessions.\\n* **URL Hiding:** Excluding the session identifier from the URL prevents attackers from easily exploiting it in cross-site scripting (XSS) attacks.\\n* **Secure Storage:** Storing session data securely on the server-side prevents attackers from accessing it through client-side manipulation.\\n* **Timeout Mechanisms:** Session timeouts (idle, absolute) automatically invalidate sessions, limiting the window of vulnerability for attackers to exploit compromised credentials.'},\n {'context': 'For all such data:\\n* Is any data transmitted in clear text? 
This concerns protocols such as HTTP, SMTP, FTP also using TLS upgrades like STARTTLS. External internet traffic is hazardous. Verify all internal traffic, e.g., between load balancers, web servers, or back-end systems.\\n * Are any old or weak cryptographic algorithms or protocols used either by default or in older code?\\n*',\n 'question': 'Does the system transmit any sensitive data in clear text, particularly through protocols like HTTP, SMTP, or FTP, even with TLS upgrades like STARTTLS?',\n 'ground_truth': \"This question checks if sensitive data is transmitted in plain text even with TLS upgrades like STARTTLS. It's crucial to identify any vulnerabilities where sensitive data might be exposed during transmission. The context emphasizes the importance of examining both external internet traffic and internal communication between systems like load balancers, web servers, and back-end systems.\"},\n {'context': 'Threat modeling should be integrated into refinement sessions (or similar activities); look for changes in data flows and access control or other security controls. In the user story development determine the correct flow and failure states, ensure they are well understood and agreed upon by responsible and impacted parties. Analyze assumptions and conditions for expected and failure flows, ensure they are still accurate and desirable. Determine how to validate the assumptions and enforce conditions needed for proper behaviors. Ensure the results are documented in the user story.',\n 'question': 'How can threat modeling be incorporated into user story development to identify potential vulnerabilities in data flows and access control during refinement sessions?',\n 'ground_truth': \"During user story refinement sessions, threat modeling can be incorporated to identify potential vulnerabilities in data flows and access control by analyzing the story's elements and processes. This involves identifying the data involved, who has access to it, how it is used, and the potential threats related to each step. Key steps include:\\n\\n1. **Data Flow Analysis:** Examine the user story to understand how data is created, stored, processed, transmitted, and consumed. Identify potential risks associated with each stage, such as unauthorized access, modification, disclosure, or deletion.\\n2. **Access Control Analysis:** Determine who has access to the data at each stage and whether the access controls are appropriate. Evaluate the potential for unauthorized access, escalation of privileges, or bypass of security controls.\\n3. **Failure State Analysis:** Explore the potential failure states of the user story, including unexpected input, errors, and denial of service attacks. Identify how these failure states could impact data flow and access control.\\n4. **Assumption Validation:** Examine the assumptions made about the data flow and access control mechanisms. Determine how these assumptions can be validated and whether they are still valid in the context of evolving security threats.\\n5. **Condition Enforcement:** Ensure the user story includes conditions that enforce proper behavior and prevent potential vulnerabilities. 
These conditions should be clearly documented and agreed upon by all stakeholders.\\n\\nBy integrating threat modeling into the user story development process, we can identify potential vulnerabilities early on, mitigating risks and improving the overall security of the system.\"},\n {'context': 'Acting as a user without being logged in or acting as an admin when logged in as a user.\\n * Metadata manipulation, such as replaying or tampering with a JSON Web Token (JWT) access control token, or a cookie or hidden field manipulated to elevate privileges or abusing JWT invalidation.\\n * CORS misconfiguration allows API access from unauthorized/untrusted origins.\\n * Force browsing to authenticated pages as an unauthenticated user or to privileged pages as a standard user.\\n How to Prevent\\nAccess control is only effective in trusted server-side code or server-less API, where the attacker cannot modify the access control check or metadata.\\n',\n 'question': 'How can an attacker exploit metadata manipulation techniques, such as replaying or tampering with JWT access tokens, to gain unauthorized access or elevate privileges?',\n 'ground_truth': \"An attacker can exploit metadata manipulation techniques to gain unauthorized access or elevate privileges in several ways:\\n\\n* **Replaying JWT Access Tokens:** Attackers can intercept and record a valid JWT access token issued to a legitimate user. They can then replay this token to gain access to resources or perform actions that the original user is authorized to do, bypassing authentication mechanisms.\\n* **Tampering with JWT Access Tokens:** Attackers can modify the contents of a JWT access token to change user roles, permissions, or other sensitive information. For instance, they can change the 'role' claim from 'user' to 'admin' to gain administrative privileges. This can be done by exploiting vulnerabilities in the JWT implementation or by manipulating the token's signature.\\n* **Abusing JWT Invalidation:** Attackers can try to force invalidation of a user's valid JWT token, potentially leading to a denial of service attack or forcing the user to re-authenticate. They might achieve this by exploiting weaknesses in the token invalidation mechanism or by triggering false invalidations.\\n* **Manipulating Cookies or Hidden Fields:** Similar to JWT tokens, cookies or hidden form fields can be manipulated to change user roles or permissions. Attackers might try to modify these elements to gain unauthorized access or escalate privileges.\\n\\nThese techniques effectively bypass authentication and authorization mechanisms, allowing attackers to gain access to sensitive resources or elevate their privileges within the system.\"},\n {'context': 'Automated testing of all parameters, headers, URL, cookies, JSON, SOAP, and XML data inputs is strongly encouraged. 
Organizations can include static (SAST), dynamic (DAST), and interactive (IAST) application security testing tools into the CI/CD pipeline to identify introduced injection flaws before production deployment.\\n How to Prevent\\nPreventing injection requires keeping data separate from commands and queries:\\n* The preferred option is to use a safe API, which avoids using the interpreter entirely, provides a parameterized interface, or migrates to Object Relational Mapping Tools (ORMs).Note: Even when parameterized, stored procedures can still introduce SQL injection if PL/SQL or T-SQL concatenates queries and data or executes hostile data with EXECUTE IMMEDIATE or exec().\\n * Use positive server-side input validation.',\n 'question': 'What are some ways to prevent injection flaws in applications, and what are the limitations of parameterized stored procedures?',\n 'ground_truth': \"Several methods can be employed to prevent injection flaws in applications, primarily focusing on separating data from commands and queries. Here are some key strategies:\\n\\n1. **Utilize Safe APIs:** This approach is the most robust as it eliminates the need for direct interaction with interpreters. Safe APIs provide a parameterized interface, allowing for secure data handling. Additionally, migrating to Object Relational Mapping Tools (ORMs) further enhances security by abstracting database interactions. \\n\\n2. **Parameterized Stored Procedures:** While generally considered safe, parameterized stored procedures have limitations. They can still introduce SQL injection vulnerabilities if the stored procedure code itself concatenates queries and data using PL/SQL or T-SQL functions like EXECUTE IMMEDIATE or exec(), enabling the execution of malicious data. \\n\\n3. **Positive Server-Side Input Validation:** This technique involves validating user inputs against predefined rules and formats. By ensuring data adheres to expected patterns, it minimizes the risk of injection attacks. \\n\\n4. **Automated Testing:** Integrating static (SAST), dynamic (DAST), and interactive (IAST) application security testing tools into the CI/CD pipeline is crucial. These tools automatically test parameters, headers, URLs, cookies, JSON, SOAP, and XML data inputs, effectively identifying and mitigating injection vulnerabilities before production deployment. \\n\\nIt's essential to remember that even with these measures, vulnerability prevention is an ongoing process. Regular security audits and updates are required to stay ahead of evolving threats.\"},\n {'context': \"* Ensure libraries and dependencies, such as npm or Maven, are consuming trusted repositories. 
If you have a higher risk profile, consider hosting an internal known-good repository that's vetted.\\n * Ensure that a software supply chain security tool, such as OWASP Dependency Check or OWASP CycloneDX, is used to verify that components do not contain known vulnerabilities\\n* Ensure that there is a review process for code and configuration changes to minimize the chance that malicious code or configuration could be introduced into your software pipeline.\\n * Ensure that your CI/CD pipeline has proper segregation, configuration, and access control to ensure the integrity of the code flowing through the build and deploy processes.\\n\",\n 'question': 'How can a penetration tester evaluate the security of a software supply chain, considering the use of libraries, dependencies, and code review processes?',\n 'ground_truth': 'A penetration tester can evaluate the security of a software supply chain by focusing on several key areas: **1. Library and Dependency Security:** Verifying that libraries and dependencies are sourced from trusted repositories is crucial. For high-risk scenarios, internal known-good repositories with vetted components should be considered. **2. Vulnerability Scanning:** Utilizing tools like OWASP Dependency Check or CycloneDX to scan for known vulnerabilities in components is essential. This helps identify potential security weaknesses within the software supply chain. **3. Code and Configuration Review:** Implementing a robust code and configuration change review process minimizes the risk of introducing malicious code or misconfigured components. This process should involve thorough analysis of both code and configuration changes. **4. CI/CD Pipeline Security:** Ensuring the CI/CD pipeline has appropriate segregation, configuration, and access control measures helps maintain the integrity of the code throughout the build and deployment processes. This includes limiting access to sensitive components and verifying that all processes operate as intended.'},\n {'context': 'Appropriate alerting thresholds and response escalation processes are not in place or effective.\\n * Penetration testing and scans by dynamic application security testing (DAST) tools (such as OWASP ZAP) do not trigger alerts.\\n The application cannot detect, escalate, or alert for active attacks in real-time or near real-time.\\n You are vulnerable to information leakage by making logging and alerting events visible to a user or an attacker (see A01:2021-Broken Access Control).\\n How to Prevent\\nDevelopers should implement some or all the following controls, depending on the risk of the application:\\n*',\n 'question': \"If penetration testing tools like OWASP ZAP don't trigger alerts, how can the application effectively detect and respond to active attacks in real-time?\",\n 'ground_truth': \"Even though penetration testing tools like OWASP ZAP might not trigger alerts due to insufficient alerting thresholds or escalation processes, a robust application security strategy should focus on real-time detection and response. This requires implementing comprehensive security measures that go beyond relying solely on penetration testing tools. To detect and respond to active attacks effectively, the application should incorporate the following: \\n\\n* **Real-time Monitoring and Threat Intelligence:** Implement a robust security information and event management (SIEM) system to monitor application logs and network traffic in real-time. 
This system should be integrated with threat intelligence feeds to identify suspicious activities and potential attacks. \\n\\n* **Intrusion Detection and Prevention Systems (IDS/IPS):** Deploy network-based or host-based intrusion detection and prevention systems to analyze network traffic and identify malicious patterns. These systems can block or alert on suspicious activities, including known attack signatures. \\n\\n* **Web Application Firewalls (WAFs):** Utilize a WAF to protect the application from common web-based attacks. WAFs can inspect incoming traffic, block malicious requests, and enforce security policies to prevent attacks like SQL injection and cross-site scripting (XSS). \\n\\n* **Behavioral Analysis and Anomaly Detection:** Implement machine learning algorithms and anomaly detection techniques to analyze application behavior and identify deviations from normal patterns. This can help identify and respond to zero-day attacks and other novel threats. \\n\\n* **Security Automation and Orchestration:** Automate security responses to suspicious activities, such as blocking malicious IP addresses or triggering incident response protocols. This allows for rapid and efficient handling of threats. \\n\\n* **Secure Development Practices:** Adopt secure coding practices throughout the software development lifecycle to minimize vulnerabilities. This includes using secure libraries, validating user input, and implementing robust authentication and authorization mechanisms. \\n\\nBy implementing these comprehensive security measures, the application can effectively detect and respond to active attacks in real-time, even if penetration testing tools don't trigger alerts. This proactive approach helps to mitigate the risk of information leakage and ensure ongoing application security.\"},\n {'context': 'Are default crypto keys in use, weak crypto keys generated or re-used, or is proper key management or rotation missing? Are crypto keys checked into source code repositories?\\n* Is encryption not enforced, e.g., are any HTTP headers (browser) security directives or headers missing?\\n * Is the received server certificate and the trust chain properly validated?\\n* Are initialization vectors ignored, reused, or not generated sufficiently secure for the cryptographic mode of operation?',\n 'question': 'Are default cryptographic keys being used, are weak keys being generated or reused, or is proper key management or rotation missing? If so, what measures are in place to mitigate these risks?',\n 'ground_truth': \"This assessment seeks to uncover vulnerabilities related to cryptographic key management, generation, and usage. We'll investigate if default keys are being used, if weak keys are generated or reused, and if proper key management and rotation practices are in place. Additionally, we'll check if cryptographic keys are stored in source code repositories, a major security risk. We will also examine whether encryption is properly enforced by analyzing HTTP headers for security directives. The assessment will further investigate the validation of server certificates and trust chains, ensuring their authenticity. Finally, we will verify the secure generation and use of initialization vectors (IVs) for different cryptographic modes of operation, ensuring they are not ignored, reused, or insufficiently secure.\"},\n {'context': \"* Don't store sensitive data unnecessarily. Discard it as soon as possible or use PCI DSS compliant tokenization or even truncation. 
Data that is not retained cannot be stolen.\\n * Make sure to encrypt all sensitive data at rest.\\n * Ensure up-to-date and strong standard algorithms, protocols, and keys are in place; use proper key management.\\n\",\n 'question': 'What are some recommended practices for protecting sensitive data at rest, considering encryption, data retention, and key management?',\n 'ground_truth': \"To protect sensitive data at rest, follow these recommended practices:\\n\\n1. **Minimize Data Retention:** Only retain sensitive data if absolutely necessary. If possible, discard data as soon as it is no longer needed. Consider using PCI DSS compliant tokenization or even truncation to minimize the amount of sensitive data stored.\\n\\n2. **Encrypt All Sensitive Data:** Encrypt all sensitive data at rest, using strong encryption algorithms like AES-256. This ensures that even if an attacker gains access to the storage system, they won't be able to read the data.\\n\\n3. **Implement Secure Key Management:** Utilize robust key management practices to protect encryption keys. This involves:\\n * **Key Rotation:** Regularly rotate encryption keys to mitigate the impact of key compromise. \\n * **Key Separation:** Store keys separately from the data they protect to prevent attackers from accessing both simultaneously. \\n * **Key Backup:** Maintain secure backups of keys to ensure data recovery in case of key loss or corruption. \\n\\n4. **Use Strong Algorithms and Protocols:** Employ industry-standard, up-to-date encryption algorithms, protocols, and cryptographic tools to ensure data confidentiality and integrity.\"},\n {'context': 'Moving up from the fifth position, 94% of applications were tested for some form of broken access control with the average incidence rate of 3.81%, and has the most occurrences in the contributed dataset with over 318k.',\n 'question': 'What percentage of applications were tested for broken access control vulnerabilities, and what is the average incidence rate of this vulnerability?',\n 'ground_truth': '94% of applications were tested for broken access control vulnerabilities, with an average incidence rate of 3.81%.'},\n {'context': \"An attacker monitors network traffic (e.g., at an insecure wireless network), downgrades connections from HTTPS to HTTP, intercepts requests, and steals the user's session cookie. The attacker then replays this cookie and hijacks the user's (authenticated) session, accessing or modifying the user's private data. Instead of the above they could alter all transported data, e.g., the recipient of a money transfer.\\n Scenario #3: The password database uses unsalted or simple hashes to store everyone's passwords. A file upload flaw allows an attacker to retrieve the password database.\",\n 'question': 'If a password database uses unsalted or simple hashes, how could an attacker exploit a file upload flaw to gain access to the passwords?',\n 'ground_truth': 'If the password database uses unsalted or simple hashes, an attacker could exploit the file upload flaw to gain access to the passwords and then use readily available tools to crack the hashes and obtain the plain-text passwords. Since the hashes are unsalted, an attacker can use precomputed tables of common passwords and their hashes to quickly find the matching plain-text password for each user. The attacker can also use rainbow tables, which are precomputed tables that store the hashes of millions or billions of passwords, to find the matching plain-text password. 
The attacker could also use brute-force attacks to try different passwords until they find one that matches the hash. In addition, an attacker could leverage a dictionary attack which uses a list of common words and phrases to try to guess the password. Due to the unsalted nature of the hashes, these cracking methods are significantly more effective and will likely yield the plain-text password for each user relatively quickly. '},\n {'context': 'Questions and answers cannot be trusted as evidence of identity as more than one person can know the answers, which is why they are prohibited. Such code should be removed and replaced with a more secure design.\\n Scenario #2: A cinema chain allows group booking discounts and has a maximum of fifteen attendees before requiring a deposit. Attackers could threat model this flow and test if they could book six hundred seats and all cinemas at once in a few requests, causing a massive loss of income.\\n',\n 'question': 'If a cinema chain offers group booking discounts with a maximum of fifteen attendees before requiring a deposit, could an attacker exploit this system by booking a large number of seats, potentially causing significant financial loss?',\n 'ground_truth': 'Yes, an attacker could exploit this system by booking a large number of seats, potentially causing significant financial loss. As the context describes, an attacker could exploit the group booking system by booking hundreds of seats at multiple cinemas, potentially causing a significant loss of income for the cinema chain. This is a valid concern that should be addressed in the security design of the group booking system. A more secure design might involve setting a limit on the total number of seats bookable at a time, requiring a deposit for bookings exceeding a certain size, or implementing other security measures to prevent such attacks.'},\n {'context': 'There is a difference between insecure design and insecure implementation. We differentiate between design flaws and implementation defects for a reason, they have different root causes and remediation. A secure design can still have implementation defects leading to vulnerabilities that may be exploited. An insecure design cannot be fixed by a perfect implementation as by definition, needed security controls were never created to defend against specific attacks. One of the factors that contribute to insecure design is the lack of business risk profiling inherent in the software or system being developed, and thus the failure to determine what level of security design is required.\\n',\n 'question': 'What is the main reason why an insecure design cannot be fixed by a perfect implementation?',\n 'ground_truth': 'An insecure design cannot be fixed by a perfect implementation because it lacks the necessary security controls to defend against specific attacks. 
These controls were never created in the design phase, meaning even a flawless implementation cannot retroactively add them.'},\n {'context': \"* Do not ship or deploy with any default credentials, particularly for admin users.\\n Implement weak password checks, such as testing new or changed passwords against the top 10,000 worst passwords list.\\n * Align password length, complexity, and rotation policies with National Institute of Standards and Technology (NIST) 800-63b's guidelines in section 5.1.1 for Memorized Secrets or other modern, evidence-based password policies.\\n * Ensure registration, credential recovery, and API pathways are hardened against account enumeration attacks by using the same messages for all outcomes.\\n * Limit or increasingly delay failed login attempts, but be careful not to create a denial of service scenario.\",\n 'question': 'What measures can be taken to mitigate account enumeration attacks during registration, credential recovery, and API interactions?',\n 'ground_truth': 'To mitigate account enumeration attacks during registration, credential recovery, and API interactions, it is crucial to ensure that the same message is returned for all outcomes, regardless of success or failure. This prevents attackers from determining the existence of valid usernames or email addresses by analyzing variations in response messages. For example, returning a generic \"Invalid credentials\" message for both incorrect usernames and passwords, as well as for non-existent accounts, can effectively obfuscate this information from attackers. Additionally, it\\'s important to implement rate limiting and delay mechanisms for failed login attempts to deter brute-force attacks. However, it is crucial to avoid excessive delays or restrictions that could unintentionally result in denial of service for legitimate users.'},\n {'context': 'Being functional programmers, they tried to ensure that their code is immutable. The solution they came up with is serializing the user state and passing it back and forth with each request. An attacker notices the \"rO0\" Java object signature (in base64) and uses the Java Serial Killer tool to gain remote code execution on the application server.\\n',\n 'question': 'If the application uses Java serialization to pass user state between requests, what vulnerability could an attacker exploit, and what tool might they use to achieve remote code execution?',\n 'ground_truth': 'An attacker could exploit the **Java deserialization vulnerability**. This vulnerability allows an attacker to inject malicious code into a serialized object, which is then executed when the object is deserialized by the application. The **Java Serial Killer** tool could be used to achieve remote code execution by creating a malicious serialized object that exploits this vulnerability.'},\n {'context': 'Security logging and monitoring came from the Top 10 community survey (#3), up slightly from the tenth position in the OWASP Top 10 2017.',\n 'question': 'In the OWASP Top 10 2017, what position did security logging and monitoring occupy?',\n 'ground_truth': 'Security logging and monitoring occupied the tenth position in the OWASP Top 10 2017.'},\n {'context': 'Injection slides down to the third position. 
94% of the applications were tested for some form of injection with a max incidence rate of 19%, an average incidence rate of 3%, and 274k occurrences.',\n 'question': 'What percentage of applications were found to be vulnerable to injection attacks during the penetration testing, and how many occurrences of injection vulnerabilities were identified?',\n 'ground_truth': 'During the penetration testing, 94% of applications were tested for injection vulnerabilities, with a maximum incidence rate of 19%. A total of 274,000 occurrences of injection vulnerabilities were identified.'},\n {'context': \"* Ensure libraries and dependencies, such as npm or Maven, are consuming trusted repositories. If you have a higher risk profile, consider hosting an internal known-good repository that's vetted.\\n * Ensure that a software supply chain security tool, such as OWASP Dependency Check or OWASP CycloneDX, is used to verify that components do not contain known vulnerabilities\\n* Ensure that there is a review process for code and configuration changes to minimize the chance that malicious code or configuration could be introduced into your software pipeline.\\n * Ensure that your CI/CD pipeline has proper segregation, configuration, and access control to ensure the integrity of the code flowing through the build and deploy processes.\\n\",\n 'question': 'How can a penetration tester exploit vulnerabilities in libraries and dependencies used in a software development pipeline?',\n 'ground_truth': \"A penetration tester can exploit vulnerabilities in libraries and dependencies by leveraging several techniques. They could use automated tools like OWASP Dependency Check to identify known vulnerabilities in the project's dependencies. If a vulnerability is discovered, the tester could attempt to exploit it by creating a malicious package that mimics a legitimate library. This package could then be introduced into the software development pipeline, potentially allowing the attacker to gain access to sensitive information or disrupt the system. They could also analyze the code for potential vulnerabilities, such as insecure coding practices that could be exploited in the libraries themselves. By understanding the code flow and interactions with libraries, penetration testers can identify opportunities to inject malicious code or gain unauthorized access. Additionally, they could use social engineering techniques to persuade developers to install malicious dependencies, allowing the attacker to gain control of the system.\"},\n {'context': \"Suppose one of these applications is the admin console, and default accounts weren't changed. In that case, the attacker logs in with default passwords and takes over.\\n Scenario #2: Directory listing is not disabled on the server. An attacker discovers they can simply list directories. The attacker finds and downloads the compiled Java classes, which they decompile and reverse engineer to view the code.\",\n 'question': \"If an attacker can access a web server's directory listing, what security vulnerability might they exploit to obtain and analyze the application's source code?\",\n 'ground_truth': \"The vulnerability being exploited here is **directory listing vulnerability**. This allows the attacker to access and list the contents of directories on the web server. By listing directories, the attacker can locate and download files, including compiled application code like Java classes. 
These classes can then be decompiled and analyzed to understand the application's logic and potentially find security weaknesses.\"},\n {'context': 'Appropriate alerting thresholds and response escalation processes are not in place or effective.\\n * Penetration testing and scans by dynamic application security testing (DAST) tools (such as OWASP ZAP) do not trigger alerts.\\n The application cannot detect, escalate, or alert for active attacks in real-time or near real-time.\\n You are vulnerable to information leakage by making logging and alerting events visible to a user or an attacker (see A01:2021-Broken Access Control).\\n How to Prevent\\nDevelopers should implement some or all the following controls, depending on the risk of the application:\\n*',\n 'question': \"If penetration tests and DAST tool scans don't trigger alerts, how effectively can the application detect and respond to real-time attacks?\",\n 'ground_truth': \"If penetration tests and DAST tool scans don't trigger alerts, it indicates that the application's security monitoring and alerting mechanisms are ineffective. This means the application is likely unable to detect and respond to real-time attacks. Without proper alerting thresholds and escalation processes, the application cannot detect, escalate, or alert for active attacks in real-time or near real-time. This vulnerability leaves the application susceptible to various attacks, including information leakage through exposed logging and alerting events. To address this, developers should implement robust security monitoring, define appropriate alert thresholds, establish effective response escalation processes, and ensure proper logging and access control to prevent unauthorized access to sensitive information.\"},\n {'context': 'Such flaws can be accidental (e.g., coding error) or intentional (e.g., a backdoor in a component). Some example exploitable component vulnerabilities discovered are:\\n* CVE-2017-5638, a Struts 2 remote code execution vulnerability that enables the execution of arbitrary code on the server, has been blamed for significant breaches.\\n * While the internet of things (IoT) is frequently difficult or impossible to patch, the importance of patching them can be great (e.g., biomedical devices).\\n There are automated tools to help attackers find unpatched or misconfigured systems. For example, the Shodan IoT search engine can help you find devices that still suffer from Heartbleed vulnerability patched in April 2014.',\n 'question': 'What are some examples of exploitable component vulnerabilities and how can these vulnerabilities be used to compromise systems?',\n 'ground_truth': 'Exploitable component vulnerabilities can be exploited to compromise systems in various ways. One example is **CVE-2017-5638, a Struts 2 remote code execution vulnerability.** This flaw allowed attackers to execute arbitrary code on the server, leading to significant breaches. Another example is the **Heartbleed vulnerability, which affected OpenSSL, a widely used cryptographic library.** Attackers could exploit this vulnerability to steal sensitive data, such as passwords and encryption keys. These vulnerabilities can be used by attackers to gain unauthorized access to systems, steal data, or even launch further attacks. 
These vulnerabilities are often exploited by attackers using automated tools such as Shodan, which can help them find unpatched or misconfigured systems.'},\n {'context': \"The attacker then finds a severe access control flaw in the application.\\n Scenario #3: The application server's configuration allows detailed error messages, e.g., stack traces, to be returned to users. This potentially exposes sensitive information or underlying flaws such as component versions that are known to be vulnerable.\\n Scenario #4: A cloud service provider (CSP) has default sharing permissions open to the Internet by other CSP users. This allows sensitive data stored within cloud storage to be accessed.\",\n 'question': 'What type of sensitive information could be revealed by an application server configuration that returns detailed error messages, such as stack traces, to users?',\n 'ground_truth': 'Detailed error messages, such as stack traces, can reveal sensitive information like internal system architecture, database schemas, code structure, and specific component versions. This information could be exploited by attackers to identify vulnerabilities and craft targeted attacks.'},\n {'context': 'Threat modeling should be integrated into refinement sessions (or similar activities); look for changes in data flows and access control or other security controls. In the user story development determine the correct flow and failure states, ensure they are well understood and agreed upon by responsible and impacted parties. Analyze assumptions and conditions for expected and failure flows, ensure they are still accurate and desirable. Determine how to validate the assumptions and enforce conditions needed for proper behaviors. Ensure the results are documented in the user story.',\n 'question': 'During user story refinement, how can threat modeling be integrated to identify potential vulnerabilities related to data flows, access control, and other security controls?',\n 'ground_truth': 'During user story refinement, threat modeling can be integrated to identify potential vulnerabilities by following these steps:\\n\\n1. **Data Flow Analysis:** Analyze the data flows described in the user story. Identify where data is created, stored, processed, transmitted, and accessed. Look for opportunities for data leakage, interception, or modification. Identify critical data points and understand the security requirements around them.\\n\\n2. **Access Control Review:** Examine the access controls defined in the user story. Consider who has access to what data and at what stages of the process. Identify potential vulnerabilities like unauthorized access, privilege escalation, or data exfiltration.\\n\\n3. **Security Control Assessment:** Evaluate the security controls mentioned in the user story. Determine if the controls are appropriate for the identified threats and vulnerabilities. Consider the effectiveness of controls such as encryption, authentication, authorization, logging, and monitoring.\\n\\n4. **Assumption and Condition Validation:** Carefully review the assumptions and conditions related to data flow, access control, and security controls. Ensure these assumptions remain valid and that the conditions are enforceable. Identify potential issues that might arise from incorrect or incomplete assumptions.\\n\\n5. **Failure State Analysis:** Examine the defined failure states in the user story. Determine how vulnerabilities might be exploited in these failure scenarios. 
Consider how to mitigate risks and recover from such situations.\\n\\n6. **Documentation and Agreement:** Document the findings of the threat modeling integration process. Include details about identified vulnerabilities, recommended mitigations, and any necessary changes to the user story. Ensure alignment and agreement on the proposed security measures with responsible and impacted parties.\\n\\nBy integrating threat modeling into user story refinement, you can proactively identify and address security risks early in the development lifecycle, leading to more secure and robust software solutions.'}]" + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-07-16T08:34:52.939729Z", + "start_time": "2024-07-16T08:34:52.931731Z" + } + }, + "id": "8db32941a75d9373", + "execution_count": 28 }, { "cell_type": "code", diff --git a/test/benchmarks/rag/evaluation.py b/test/benchmarks/rag/evaluation.py index aa59185..90d1100 100644 --- a/test/benchmarks/rag/evaluation.py +++ b/test/benchmarks/rag/evaluation.py @@ -16,11 +16,11 @@ from src.agent.llm import LLM from src.agent.knowledge import Store, Collection, Document, Topic -from test.benchmarks.rag.metrics import HuggingFaceLLM, ContextRecall, ContextPrecision, EVAL_PROMPTS +from test.benchmarks.rag.metrics import ContextRecall, ContextPrecision, EVAL_PROMPTS GEN_PROMPT = { - 'gemma:2b': { + 'gemma2:9b': { 'sys': textwrap.dedent(""" You are a Cybersecurity professional assistant, your job is to provide an answer to context specific questions. You will be provided with additional Context information to provide an answer. @@ -71,7 +71,7 @@ def init_knowledge_base(data: dict[str: list[Topic]]) -> Store: return store -def generate_evaluation_dataset(vdb: Store, qa_paths: list, model: str = 'gemma:2b'): +def generate_evaluation_dataset(vdb: Store, qa_paths: list, model: str = 'gemma2:9b'): """Uses the RAG pipeline to generate an evaluation dataset composed of questions and ground truths from Q&A dataset and context + answers from the RAG pipeline.""" @@ -107,11 +107,9 @@ def gen_context_answer(question: str, llm: LLM): return pd.DataFrame(eval_data) -def evaluate(vdb: Store, qa_paths: list, - evaluation_api_key: str, - generation_model: str = 'gemma:2b', - evaluation_model: str = 'mistral:7b', - eval_hf_url: str = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"): +def evaluate(vdb: Store, qa_paths: list, endpoint: str, + generation_model: str = 'gemma2:9b', + evaluation_model: str = 'gemma2:9b'): """Given the Vector Database and the synthetic Q&A dataset generated in `dataset_generation.ipynb` runs the evaluation process for the RAG pipeline. 
@@ -125,16 +123,19 @@ def evaluate(vdb: Store, qa_paths: list,
     eval_dataset = generate_evaluation_dataset(vdb, qa_paths, generation_model)
 
     # Setup evaluation metrics
-    hf_llm = HuggingFaceLLM(eval_hf_url, evaluation_api_key)
+    llm = LLM(
+        model=evaluation_model,
+        client_url=endpoint,
+    )
     ctx_recall = ContextRecall(
         EVAL_PROMPTS[evaluation_model]['context_recall']['sys'],
         EVAL_PROMPTS[evaluation_model]['context_recall']['usr'],
-        hf_llm
+        llm
     )
     ctx_precision = ContextPrecision(
         EVAL_PROMPTS[evaluation_model]['context_precision']['sys'],
         EVAL_PROMPTS[evaluation_model]['context_precision']['usr'],
-        hf_llm
+        llm
     )
 
     # Run
@@ -204,9 +205,9 @@ def plot_eval(plot_df: pd.DataFrame, name: str):
 
 if __name__ == '__main__':
     load_dotenv()
-    hf_api_key = os.environ.get('HF_API_KEY')
-    if not hf_api_key:
-        raise RuntimeError('Missing HuggingFace API Key in .env')
+    OLLAMA_ENDPOINT = os.environ.get('ENDPOINT')
+    if not OLLAMA_ENDPOINT:
+        raise RuntimeError('Missing environment variable "ENDPOINT"')
 
     knowledge_base: Store = init_knowledge_base({
         '../../../data/json/owasp.json': [Topic.WebPenetrationTesting]
@@ -217,5 +218,9 @@ def plot_eval(plot_df: pd.DataFrame, name: str):
         # '../../../data/rag_eval/owasp_100-200.json'
     ]
 
-    eval_results_df = evaluate(knowledge_base, synthetic_qa_paths, hf_api_key)
+    eval_results_df = evaluate(
+        vdb=knowledge_base,
+        qa_paths=synthetic_qa_paths,
+        endpoint=OLLAMA_ENDPOINT
+    )
     update_evaluation_plots(eval_results_df)
diff --git a/test/benchmarks/rag/metrics.py b/test/benchmarks/rag/metrics.py
index 428c6ab..baf447c 100644
--- a/test/benchmarks/rag/metrics.py
+++ b/test/benchmarks/rag/metrics.py
@@ -9,6 +9,8 @@
 import requests
 import numpy as np
 
+from src.agent.llm import LLM, Ollama
+
 
 EVAL_PROMPTS = {
     'mistral:7b': {
@@ -73,31 +75,12 @@
 JSON_PATTERN = r'{"result": \[[^\]]*\]}'
 
 
-@dataclass
-class HuggingFaceLLM:
-    """Represents HuggingFace Inference Endpoint, it is used for convenience in performance of evaluation."""
-    url: str
-    key: str
-
-    def __post_init__(self):
-        self.headers = {"Authorization": f"Bearer {self.key}", "Content-Type": "application/json"}
-
-    def __query(self, payload):
-        response = requests.post(self.url, headers=self.headers, json={'inputs': payload})
-        response.raise_for_status()
-        return response.json()
-
-    def query(self, messages: list):
-        prompt = '\n'.join([msg['content'] for msg in messages])
-        return self.__query(prompt)
-
-
 @dataclass
 class Metric(ABC):
     """Represents a RAG evaluation metric using LLM-as-a-judge paradigm"""
     system_prompt: str
     user_prompt: str
-    llm_provider: HuggingFaceLLM
+    llm: LLM
 
     @abstractmethod
     def compute(self, *args, **kwargs) -> float:
@@ -105,9 +88,12 @@ def compute(self, *args, **kwargs) -> float:
         pass
 
     @staticmethod
-    def extract_response(response: list):
-        """Extracts the json results from a HuggingFace Inference Endpoint response"""
-        eval_json = response[0]['generated_text'].split('\n')[-1]
+    def extract_response(response):
+        """Extracts the JSON result from the evaluation LLM response"""
+        # The local LLM client returns an Ollama-style chat response:
+        # the judge's JSON verdict is in the message content rather than
+        # in the HuggingFace 'generated_text' field used previously.
+        eval_json = response['message']['content']
 
         try:
             return np.mean(json.loads(eval_json)['result'])
@@ -129,7 +115,7 @@ def compute(self, answer: str, context: str):
             {'role': 'user',
              'content': self.user_prompt.format(answer=answer, context=context)}
         ]
-        result = self.llm_provider.query(messages)
+        result = self.llm.query(messages)
 
         return self.extract_response(result)
 
@@ -143,6 +129,6 @@ def compute(self, question: str, answer: str, context: str):
             {'role': 'user',
              'content': self.user_prompt.format(question=question,
answer=answer, context=context)} ] - result = self.llm_provider.query(messages) + result = self.llm.query(messages) return self.extract_response(result)
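Side note on the `extract_response` rework: judge models served through Ollama do not always return bare JSON, so the extraction can be hardened with the `JSON_PATTERN` regex that `metrics.py` already defines. The following is a minimal sketch, not part of the diff; it assumes `LLM.query` returns an Ollama-style chat dict (as the code above reads `response['message']['content']`), and the helper name `extract_judge_score` and the `0.0` fallback are illustrative choices only.

import json
import re

import numpy as np

# Mirrors the constant already defined in metrics.py.
JSON_PATTERN = r'{"result": \[[^\]]*\]}'


def extract_judge_score(response: dict) -> float:
    """Pull the {"result": [...]} verdict out of an Ollama-style chat response.

    Falls back to a regex search when the judge wraps the JSON in extra
    prose, and to 0.0 (assumed default) when nothing parseable is found.
    """
    content = response.get('message', {}).get('content', '')
    try:
        # Happy path: the whole message is the JSON verdict.
        return float(np.mean(json.loads(content)['result']))
    except (json.JSONDecodeError, KeyError, TypeError, ValueError):
        # Fallback: search for the first {"result": [...]} fragment.
        match = re.search(JSON_PATTERN, content)
        if match:
            return float(np.mean(json.loads(match.group(0))['result']))
        return 0.0

A wrapper like this could be called from `Metric.extract_response` in place of the bare `json.loads`, keeping the metric classes unchanged while making the scores less sensitive to judge-model formatting quirks.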