diff --git a/examples/evaluation/How_to_evaluate_LLMs_for_SQL_generation.ipynb b/examples/evaluation/How_to_evaluate_LLMs_for_SQL_generation.ipynb index de568db..9acfc05 100644 --- a/examples/evaluation/How_to_evaluate_LLMs_for_SQL_generation.ipynb +++ b/examples/evaluation/How_to_evaluate_LLMs_for_SQL_generation.ipynb @@ -33,8 +33,8 @@ "\n", "1. **[Setup](#Setup):** Install required libraries, download data consisting of SQL queries and corresponding natural language translations.\n", "2. **[Test Development](#Test-development):** Create unit tests and define evaluation metrics for the SQL generation process.\n", - "4. **[Evaluation](#Evaluation):** Conduct tests using different prompts to assess the impact on performance.\n", - "5. **[Reporting](#Report):** Compile a report that succinctly presents the performance differences observed across various tests." + "3. **[Evaluation](#Evaluation):** Conduct tests using different prompts to assess the impact on performance.\n", + "4. **[Reporting](#Report):** Compile a report that succinctly presents the performance differences observed across various tests." ] }, { @@ -259,7 +259,7 @@ "id": "19fadf67-8b2f-4e17-95df-030a36aad90b", "metadata": {}, "source": [ - "#### Prompt\n", + "#### Prompting the LLM\n", "\n", "For this demonstration purposes, we use a fairly simple prompt requesting GPT to generate a `(context, answer)` pair. `context` is the `CREATE` SQL statement, and `answer` is the `SELECT` SQL statement. We supply the natural language question as part of the prompt. We request the response to be in JSON format, so that it can be parsed easily." ] @@ -274,66 +274,37 @@ "name": "stdout", "output_type": "stream", "text": [ - "('Translate this natural language request into a JSON object containing two '\n", - " 'SQL queries. 
\\n'\n", - " 'The first query should be a CREATE statement for a table answering the '\n", - " \"user's request, while the second should be a SELECT query answering their \"\n", - " 'question.')\n" + "Question: How many heads of the departments are older than 56 ?\n", + "Answer: {\"create\":\"CREATE TABLE DepartmentHeads (\\n id INT PRIMARY KEY,\\n name VARCHAR(100),\\n age INT,\\n department VARCHAR(100)\\n);\",\"select\":\"SELECT COUNT(*) AS NumberOfHeadsOlderThan56 \\nFROM DepartmentHeads \\nWHERE age > 56;\"}\n" ] } ], "source": [ - "system_prompt = '''Translate this natural language request into a JSON object containing two SQL queries. \n", - "The first query should be a CREATE statement for a table answering the user's request, while the second should be a SELECT query answering their question.'''\n", - "\n", - "pprint(system_prompt)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "3a20d712", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'content': 'Translate this natural language request into a JSON object '\n", - " 'containing two SQL queries. 
\\n'\n", - " 'The first query should be a CREATE statement for a table '\n", - " \"answering the user's request, while the second should be a \"\n", - " 'SELECT query answering their question.',\n", - " 'role': 'system'},\n", - " {'content': 'How many heads of the departments are older than 56 ?',\n", - " 'role': 'user'}]\n" - ] - } - ], - "source": [ - "# Compiling the system prompt and user question into message array\n", - "\n", - "messages = []\n", - "messages.append({\"role\": \"system\", \"content\": system_prompt})\n", - "messages.append({\"role\":\"user\", \"content\": sql_df.iloc[0]['question']})\n", - "pprint(messages)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "38b704b3-6f0e-4708-bc70-96723d69da6f", - "metadata": {}, - "outputs": [], - "source": [ - "# Sending the message array to GPT, requesting a response (ensure that you have API key loaded to Env for this step)\n", + "system_prompt = \"\"\"Translate this natural language request into a JSON\n", + "object containing two SQL queries. 
The first query should be a CREATE \n", + "statement for a table answering the user's request, while the second\n", + "should be a SELECT query answering their question.\"\"\"\n", + "\n", + "# Sending the message array to GPT, requesting a response (ensure that you\n", + "# have API key loaded to Env for this step)\n", "client = OpenAI()\n", - "completion = client.beta.chat.completions.parse(\n", - " model=GPT_MODEL,\n", - " messages=messages,\n", - " response_format=LLMResponse,\n", - ")" + "\n", + "def get_response(system_prompt, user_message, model=GPT_MODEL):\n", + " messages = []\n", + " messages.append({\"role\": \"system\", \"content\": system_prompt})\n", + " messages.append({\"role\": \"user\", \"content\": user_message})\n", + "\n", + " response = client.beta.chat.completions.parse(\n", + " model=GPT_MODEL,\n", + " messages=messages,\n", + " response_format=LLMResponse,\n", + " )\n", + " return response.choices[0].message.content\n", + "\n", + "question = sql_df.iloc[0]['question']\n", + "content = get_response(system_prompt, question)\n", + "print(\"Question:\", question)\n", + "print(\"Answer:\", content)" ] }, { @@ -345,43 +316,14 @@ "\n", "Our first simple unit test checks that the LLM response is parseable into the `LLMResponse` Pydantic class that we've defined.\n", "\n", - "We'll test that our first response passes, then create a failing example to check that the check fails. This logic will be wrapped in a simple function `test_valid_schema`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "2b057391-4f83-4b5a-8843-a9ee74bee871", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"create\":\"CREATE TABLE department_heads (\\n id INT PRIMARY KEY,\\n name VARCHAR(100),\\n age INT\\n);\",\"select\":\"SELECT COUNT(*) FROM department_heads WHERE age > 56;\"}\n" - ] - } - ], - "source": [ - "# Viewing the output from GPT\n", - "\n", - "content = completion.choices[0].message.content\n", - "print(content)" - ] - }, - { - "cell_type": "markdown", - "id": "4b98bbb4-dd17-49bc-828c-e561abf5b481", - "metadata": {}, - "source": [ - "#### Validating the output schema\n", + "We'll test that our first response passes, then create a failing example to check that the check fails. This logic will be wrapped in a simple function `test_valid_schema`.\n", "\n", "We expect GPT to respond with a valid SQL, we can validate this using LLMResponse base model. `test_valid_schema` is designed to help us validate this." 
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "id": "4c7133f1-74d6-43f1-9443-09a3f8308c35", "metadata": {}, "outputs": [], @@ -399,7 +341,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "id": "6a9a9128", "metadata": {}, "outputs": [ @@ -409,7 +351,7 @@ "True" ] }, - "execution_count": 10, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -430,7 +372,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "id": "a0a26690", "metadata": {}, "outputs": [ @@ -449,7 +391,7 @@ "False" ] }, - "execution_count": 11, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -486,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "id": "9cc95481", "metadata": {}, "outputs": [], @@ -529,7 +471,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "id": "c6d2573d", "metadata": {}, "outputs": [], @@ -598,7 +540,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "id": "a9266753-4646-4901-bc14-632d3bf47aaa", "metadata": {}, "outputs": [ @@ -606,12 +548,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "CREATE SQL is: CREATE TABLE department_heads (\n", + "CREATE SQL is: CREATE TABLE DepartmentHeads (\n", " id INT PRIMARY KEY,\n", " name VARCHAR(100),\n", - " age INT\n", + " age INT,\n", + " department VARCHAR(100)\n", ");\n", - "SELECT SQL is: SELECT COUNT(*) FROM department_heads WHERE age > 56;\n" + "SELECT SQL is: SELECT COUNT(*) AS NumberOfHeadsOlderThan56 \n", + "FROM DepartmentHeads \n", + "WHERE age > 56;\n" ] } ], @@ -625,7 +570,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 12, "id": "83bc1f1b", "metadata": {}, "outputs": [ @@ -633,12 +578,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Testing create query: CREATE TABLE department_heads (\n", + "Testing create query: CREATE TABLE DepartmentHeads 
(\n", " id INT PRIMARY KEY,\n", " name VARCHAR(100),\n", - " age INT\n", + " age INT,\n", + " department VARCHAR(100)\n", ");\n", - "Testing select query: SELECT COUNT(*) FROM department_heads WHERE age > 56;\n", + "Testing select query: SELECT COUNT(*) AS NumberOfHeadsOlderThan56 \n", + "FROM DepartmentHeads \n", + "WHERE age > 56;\n", "Result of query: [(0,)]\n" ] }, @@ -648,7 +596,7 @@ "True" ] }, - "execution_count": 15, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -661,7 +609,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 13, "id": "589c7cc7", "metadata": {}, "outputs": [ @@ -680,7 +628,7 @@ "False" ] }, - "execution_count": 16, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -698,16 +646,16 @@ "id": "8148f820", "metadata": {}, "source": [ - "### Evaluation\n", + "### Using an LLM to evaluate relevancy\n", "\n", - "The last component is to **evaluate** whether the generated SQL actually answers the user's question. This test will be performed by `gpt-4o-mini`, and will assess how **relevant** the produced SQL query is when compared to the initial user request.\n", + "Next, we **evaluate** whether the generated SQL actually answers the user's question. This test will be performed by `gpt-4o-mini`, and will assess how **relevant** the produced SQL query is when compared to the initial user request.\n", "\n", "This is a simple example which adapts an approach outlined in the [G-Eval paper](https://arxiv.org/abs/2303.16634), and tested in one of our other [cookbooks](https://github.com/openai/openai-cookbook/blob/main/examples/evaluation/How_to_eval_abstractive_summarization.ipynb)." 
] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 14, "id": "029c8426", "metadata": {}, "outputs": [], @@ -760,7 +708,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 15, "id": "85cfb78d", "metadata": {}, "outputs": [], @@ -791,7 +739,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 16, "id": "607ee304", "metadata": {}, "outputs": [], @@ -812,7 +760,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 17, "id": "bd1002c2", "metadata": {}, "outputs": [ @@ -849,20 +797,12 @@ }, { "cell_type": "markdown", - "id": "afe98f7a-3e88-437f-a5cd-d105969d3020", + "id": "61b68e2a", "metadata": {}, "source": [ - "## " - ] - }, - { - "cell_type": "markdown", - "id": "fe04c6c7", - "metadata": {}, - "source": [ - "## Putting it all together\n", + "## Evaluation\n", "\n", - "We'll now test these functions in combination including our unit test and evaluations to test out two system prompts.\n", + "We will test these functions in combination including our unit test and evaluations to test out two system prompts.\n", "\n", "Each iteration of input/output and scores should be stored as a **run**. Optionally you can add GPT-4 annotation within your evaluations or as a separate step to review an entire run and highlight the reasons for errors.\n", "\n", @@ -871,72 +811,23 @@ }, { "cell_type": "markdown", - "id": "61b68e2a", + "id": "3b578b00-1b27-49de-8fd1-15c00ec99729", "metadata": {}, "source": [ - "### First run - System Prompt 1\n", + "### Building the test framework\n", "\n", - "The system under test is the first system prompt as shown below. This `run` will generate responses for this system prompt and evaluate the responses using the functions we've created so far." + "We want to build a function, `test_system_prompt`, which will run our unit tests and evaluation against a given system prompt." 
] }, { "cell_type": "code", - "execution_count": 21, - "id": "85c44a17", - "metadata": {}, - "outputs": [], - "source": [ - "# Set first system prompt\n", - "system_prompt = \"\"\"Translate this natural language request into a JSON object containing two SQL queries.\n", - "\n", - "The first query should be a CREATE statement for a table answering the user's request, while the second\n", - "should be a SELECT query answering their question. \n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "1244c44e", - "metadata": {}, - "outputs": [], - "source": [ - "def get_response(system_prompt,user_message,model=GPT_MODEL):\n", - " messages = []\n", - " messages.append({\"role\": \"system\", \"content\": system_prompt})\n", - " messages.append({\"role\": \"user\", \"content\": user_message})\n", - "\n", - " response = client.beta.chat.completions.parse(\n", - " model=GPT_MODEL,\n", - " messages=messages,\n", - " response_format=LLMResponse,\n", - " )\n", - " # response = client.chat.completions.create(model=GPT_MODEL,messages=messages,temperature=0,response_format=LLMResponse)\n", - " \n", - " return response.choices[0].message.content" - ] - }, - { - "cell_type": "markdown", - "id": "76c2723b-3060-400f-b6fe-c3c3c9d6907e", - "metadata": {}, - "source": [ - "#### Run the tests and evaluations\n", - "\n", - "The functions below, run unit test and evaluate responses" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "a98afa30", + "execution_count": 18, + "id": "40be5fae-4eb3-40ce-8645-613c24d5e0b4", "metadata": {}, "outputs": [], "source": [ "def execute_unit_tests(input_df, output_list, system_prompt):\n", - " \"\"\"Unit testing function that takes in a dataframe and appends test results to an output_list.\n", - "\n", - " This allows us to test multiple system prompts.\"\"\"\n", + " \"\"\"Unit testing function that takes in a dataframe and appends test results to an output_list.\"\"\"\n", "\n", " for x, y in 
tqdm(input_df.iterrows(), total=len(input_df)):\n", " model_response = get_response(system_prompt, y['question'])\n", @@ -961,30 +852,59 @@ " elif row['sql'] is False:\n", " return 'SQL incorrect'\n", " else:\n", - " return 'SQL correct'" + " return 'SQL correct'\n", + "\n", + "def test_system_prompt(test_df, system_prompt):\n", + " # Execute unit tests and capture results\n", + " results = []\n", + " execute_unit_tests(\n", + " input_df=test_df,\n", + " output_list=results,\n", + " system_prompt=system_prompt\n", + " )\n", + " \n", + " results_df = pd.DataFrame(results)\n", + " results_df.columns = ['question','response','format','sql']\n", + " \n", + " # Use `apply` to calculate the geval score and unit test evaluation\n", + " # for each generated response\n", + " results_df['evaluation_score'] = results_df.apply(\n", + " lambda x: get_geval_score(\n", + " RELEVANCY_SCORE_CRITERIA,\n", + " RELEVANCY_SCORE_STEPS,\n", + " x['question'],\n", + " x['response'],\n", + " 'relevancy'\n", + " ),\n", + " axis=1\n", + " )\n", + " results_df['unit_test_evaluation'] = results_df.apply(\n", + " lambda x: evaluate_row(x),\n", + " axis=1\n", + " )\n", + " return results_df" ] }, { - "cell_type": "code", - "execution_count": 24, - "id": "898e5069", + "cell_type": "markdown", + "id": "6abc2c22-d7c6-4f15-b519-60cc58ff7774", "metadata": {}, - "outputs": [], "source": [ - "# Select 50 unseen queries to test this one\n", - "test_df = sql_df.tail(50)" + "### System Prompt 1\n", + "\n", + "The system under test is the first system prompt as shown below. This `run` will generate responses for this system prompt and evaluate the responses using the functions we've created so far." 
] }, { "cell_type": "code", - "execution_count": 25, - "id": "2baec278", + "execution_count": 19, + "id": "85c44a17", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5a1c63bbf32c412b8f649d06c1a28ef4", + "model_id": "4d39ec72385f4b74bed652bfa54427f8", "version_major": 2, "version_minor": 0 }, @@ -997,44 +917,17 @@ } ], "source": [ - "# Execute unit tests and capture results\n", - "results = []\n", + "system_prompt = \"\"\"Translate this natural language request into a JSON object\n", + "containing two SQL queries.\n", "\n", - "execute_unit_tests(input_df=test_df, output_list=results, system_prompt=system_prompt)" - ] - }, - { - "cell_type": "markdown", - "id": "a070a4bf-7435-4059-bf74-eb6129cbab2b", - "metadata": {}, - "source": [ - "#### Run Evaluation\n", + "The first query should be a CREATE statement for a table answering the user's\n", + "request, while the second should be a SELECT query answering their question. \n", + "\"\"\"\n", "\n", - "Now that we have generated the SQL based on system prompt 1 (run 1), we can run evaluation against the results. 
We use pandas `apply` function to \"apply\" evaluation to each resulting generation" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "8fe18367", - "metadata": {}, - "outputs": [], - "source": [ - "results_df = pd.DataFrame(results)\n", - "results_df.columns = ['question','response','format','sql']\n", + "# Select 50 unseen queries to test this one\n", + "test_df = sql_df.tail(50)\n", "\n", - "# Execute evaluation\n", - "results_df['evaluation_score'] = results_df.apply(\n", - " lambda x: get_geval_score(\n", - " RELEVANCY_SCORE_CRITERIA,\n", - " RELEVANCY_SCORE_STEPS,\n", - " x['question'],\n", - " x['response'],\n", - " 'relevancy'\n", - " ),\n", - " axis=1\n", - ")\n", - "results_df['unit_test_evaluation'] = results_df.apply(lambda x: evaluate_row(x),axis=1)" + "results_df = test_system_prompt(test_df, system_prompt)" ] }, { @@ -1042,14 +935,14 @@ "id": "c3dd9b04-44e2-476c-86fd-c0a261b1cbdd", "metadata": {}, "source": [ - "## Viewing unit test results and evaluations - Run 1\n", - "\n", - "We can now group the outcomes of the unit test (which test the structure of response) and evaluation (which checks if the SQL is syntatically correct)." + "We can now group the outcomes of:\n", + "* the **unit tests**, which test the structure of response; and\n", + "* the **evaluation**, which checks if the SQL is syntactically correct." 
] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 20, "id": "15bf55ca-63e0-42a0-9846-703549710d4d", "metadata": {}, "outputs": [ @@ -1057,12 +950,12 @@ "data": { "text/plain": [ "unit_test_evaluation\n", - "SQL correct 41\n", - "SQL incorrect 9\n", + "SQL correct 46\n", + "SQL incorrect 4\n", "Name: count, dtype: int64" ] }, - "execution_count": 27, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1073,7 +966,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 21, "id": "b3f98f81", "metadata": { "scrolled": true @@ -1083,13 +976,13 @@ "data": { "text/plain": [ "evaluation_score\n", - "5 36\n", - "4 13\n", + "5 33\n", + "4 16\n", "3 1\n", "Name: count, dtype: int64" ] }, - "execution_count": 28, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1103,47 +996,21 @@ "id": "019f3a1d", "metadata": {}, "source": [ - "### Second run\n", + "### System Prompt 2\n", "\n", - "We now use a new system prompt to run same unit test and evaluation. Please note that we are using the same functions for unit testing and evaluations; the only change is the system prompt (which is under the test)." + "We now use a new system prompt to run the same unit tests and evaluation." ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 22, "id": "513a2da1", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Translate this natural language request into a JSON object containing two SQL queries. \n", - "The first query should be a CREATE statement for a table answering the user's request, while the second\n", - "should be a SELECT query answering their question.\n", - "Ensure the SQL is always generated on one line, never use \\n to separate rows.\n" - ] - } - ], - "source": [ - "system_prompt_2 = \"\"\"Translate this natural language request into a JSON object containing two SQL queries. 
\n", - "The first query should be a CREATE statement for a table answering the user's request, while the second\n", - "should be a SELECT query answering their question.\n", - "Ensure the SQL is always generated on one line, never use \\\\n to separate rows.\"\"\"\n", - "\n", - "print(system_prompt_2)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "70bd3e32", - "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e64c52a42f1c4202a0c820f160c1504e", + "model_id": "032f5bc915c44d88a5a0aeb52e2c1bb4", "version_major": 2, "version_minor": 0 }, @@ -1156,25 +1023,16 @@ } ], "source": [ - "# Execute unit tests\n", - "results_2 = []\n", + "system_prompt_2 = \"\"\"Translate this natural language request into a JSON\n", + "object containing two SQL queries.\n", "\n", - "execute_unit_tests(input_df=test_df,output_list=results_2,system_prompt=system_prompt_2)" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "04532d59", - "metadata": {}, - "outputs": [], - "source": [ - "results_2_df = pd.DataFrame(results_2)\n", - "results_2_df.columns = ['question','response','format','sql']\n", + "The first query should be a CREATE statement for a table answering the user's\n", + "request, while the second should be a SELECT query answering their question.\n", "\n", - "# Execute evaluation\n", - "results_2_df['evaluation_score'] = results_2_df.apply(lambda x: get_geval_score(RELEVANCY_SCORE_CRITERIA,RELEVANCY_SCORE_STEPS,x['question'],x['response'],'relevancy'),axis=1)\n", - "results_2_df['unit_test_evaluation'] = results_2_df.apply(lambda x: evaluate_row(x),axis=1)" + "Ensure the SQL is always generated on one line, never use \\\\n to separate rows.\"\"\"\n", + "\n", + "\n", + "results_2_df = test_system_prompt(test_df, system_prompt_2)" ] }, { @@ -1182,14 +1040,12 @@ "id": "cd95c3f9-f90d-451d-a32b-aeb066906779", "metadata": {}, "source": [ - "## Viewing unit test results and evaluations - Run 2\n", - 
"\n", - "We can now group the outcomes of the unit test (which test the structure of response) and evaluation (which checks if the SQL is syntatically correct)." + "As above, we can group the unit test and evaluation results." ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 23, "id": "cbaa4bdf", "metadata": {}, "outputs": [ @@ -1197,12 +1053,12 @@ "data": { "text/plain": [ "unit_test_evaluation\n", - "SQL correct 49\n", - "SQL incorrect 1\n", + "SQL correct 44\n", + "SQL incorrect 6\n", "Name: count, dtype: int64" ] }, - "execution_count": 32, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1213,7 +1069,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 24, "id": "1ada474e", "metadata": { "scrolled": true @@ -1223,13 +1079,13 @@ "data": { "text/plain": [ "evaluation_score\n", - "5 36\n", - "4 13\n", + "5 34\n", + "4 15\n", "3 1\n", "Name: count, dtype: int64" ] }, - "execution_count": 33, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1243,40 +1099,16 @@ "id": "1908c933", "metadata": {}, "source": [ - "## Report\n", + "## Reporting\n", "\n", "We'll make a simple dataframe to store and display the run performance - this is where you can use tools like Weights & Biases Prompts or Gantry to store the results for analytics on your different iterations." 
] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 25, "id": "d277222d", "metadata": {}, - "outputs": [], - "source": [ - "results_df['run'] = 1\n", - "results_df['Evaluating Model'] = 'gpt-4'\n", - "\n", - "results_2_df['run'] = 2\n", - "results_2_df['Evaluating Model'] = 'gpt-4'" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "6da35c99", - "metadata": {}, - "outputs": [], - "source": [ - "run_df = pd.concat([results_df,results_2_df])" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "4116cb37", - "metadata": {}, "outputs": [ { "data": { @@ -1324,7 +1156,7 @@ " \n", " 1\n", " What venue did the partnership of herschelle g...\n", - " {\"create\":\"CREATE TABLE cricket_partnerships (...\n", + " {\"create\":\"CREATE TABLE CricketPartnerships (\\...\n", " True\n", " True\n", " 5\n", @@ -1335,18 +1167,18 @@ " \n", " 2\n", " What is the number Played that has 310 Points ...\n", - " {\"create\":\"CREATE TABLE Scores (\\n id INT P...\n", + " {\"create\":\"CREATE TABLE game_stats (\\n numb...\n", + " True\n", " True\n", - " False\n", " 5\n", - " SQL incorrect\n", + " SQL correct\n", " 1\n", " gpt-4\n", " \n", " \n", " 3\n", " What Losing bonus has a Points against of 588?\n", - " {\"create\":\"CREATE TABLE FootballTeams (\\n T...\n", + " {\"create\":\"CREATE TABLE BonusInfo (\\n id IN...\n", " True\n", " True\n", " 5\n", @@ -1357,10 +1189,10 @@ " \n", " 4\n", " What Tries against has a Losing bonus of 7?\n", - " {\"create\":\"CREATE TABLE RugbyScores (\\n tea...\n", + " {\"create\":\"CREATE TABLE matches (\\n id SERI...\n", " True\n", " True\n", - " 4\n", + " 5\n", " SQL correct\n", " 1\n", " gpt-4\n", @@ -1377,33 +1209,50 @@ "3 What Losing bonus has a Points against of 588? \n", "4 What Tries against has a Losing bonus of 7? \n", "\n", - " response format sql \\\n", - "0 {\"create\":\"CREATE TABLE cricket_partnerships (... True True \n", - "1 {\"create\":\"CREATE TABLE cricket_partnerships (... 
True True \n", - "2 {\"create\":\"CREATE TABLE Scores (\\n id INT P... True False \n", - "3 {\"create\":\"CREATE TABLE FootballTeams (\\n T... True True \n", - "4 {\"create\":\"CREATE TABLE RugbyScores (\\n tea... True True \n", + " response format sql \\\n", + "0 {\"create\":\"CREATE TABLE cricket_partnerships (... True True \n", + "1 {\"create\":\"CREATE TABLE CricketPartnerships (\\... True True \n", + "2 {\"create\":\"CREATE TABLE game_stats (\\n numb... True True \n", + "3 {\"create\":\"CREATE TABLE BonusInfo (\\n id IN... True True \n", + "4 {\"create\":\"CREATE TABLE matches (\\n id SERI... True True \n", "\n", " evaluation_score unit_test_evaluation run Evaluating Model \n", "0 5 SQL correct 1 gpt-4 \n", "1 5 SQL correct 1 gpt-4 \n", - "2 5 SQL incorrect 1 gpt-4 \n", + "2 5 SQL correct 1 gpt-4 \n", "3 5 SQL correct 1 gpt-4 \n", - "4 4 SQL correct 1 gpt-4 " + "4 5 SQL correct 1 gpt-4 " ] }, - "execution_count": 36, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "results_df['run'] = 1\n", + "results_df['Evaluating Model'] = 'gpt-4'\n", + "\n", + "results_2_df['run'] = 2\n", + "results_2_df['Evaluating Model'] = 'gpt-4'\n", + "\n", + "run_df = pd.concat([results_df,results_2_df])\n", "run_df.head()" ] }, + { + "cell_type": "markdown", + "id": "0162a009-fc43-484c-90f6-d59a8e52f365", + "metadata": {}, + "source": [ + "#### Plotting unit test results\n", + "\n", + "We can create a simple bar chart to visualise the results of unit tests for both runs." 
+ ] + }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 26, "id": "ed800f0c", "metadata": {}, "outputs": [ @@ -1441,20 +1290,20 @@ " \n", " 1\n", " SQL correct\n", - " 41\n", + " 46\n", " \n", " \n", " SQL incorrect\n", - " 9\n", + " 4\n", " \n", " \n", " 2\n", " SQL correct\n", - " 49\n", + " 44\n", " \n", " \n", " SQL incorrect\n", - " 1\n", + " 6\n", " \n", " \n", "\n", @@ -1463,44 +1312,37 @@ "text/plain": [ " Number of records\n", "run unit_test_evaluation \n", - "1 SQL correct 41\n", - " SQL incorrect 9\n", - "2 SQL correct 49\n", - " SQL incorrect 1" + "1 SQL correct 46\n", + " SQL incorrect 4\n", + "2 SQL correct 44\n", + " SQL incorrect 6" ] }, - "execution_count": 37, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Unit test results\n", - "unittest_df_pivot = pd.pivot_table(run_df, values='format',index=['run','unit_test_evaluation'], #columns='position',\n", - " aggfunc='count')\n", + "unittest_df_pivot = pd.pivot_table(\n", + " run_df,\n", + " values='format',\n", + " index=['run','unit_test_evaluation'],\n", + " aggfunc='count'\n", + ")\n", "unittest_df_pivot.columns = ['Number of records']\n", "unittest_df_pivot" ] }, - { - "cell_type": "markdown", - "id": "0162a009-fc43-484c-90f6-d59a8e52f365", - "metadata": {}, - "source": [ - "#### Plotting the results\n", - "\n", - "We can create a simple bar chart to visualise the results of unit tests for both runs." - ] - }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 27, "id": "e2b4aa03-42f5-4c30-a610-e553937bf160", "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1547,9 +1389,19 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "id": "786515fa-6841-4820-98f9-aa29ae76cf76", + "metadata": {}, + "source": [ + "#### Plotting evaluation results\n", + "\n", + "We can similarly plot the results of the evaluation." + ] + }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 28, "id": "7228eac7-e0a9-473d-9432-e558bbc91841", "metadata": {}, "outputs": [ @@ -1591,11 +1443,11 @@ " \n", " \n", " 4\n", - " 13\n", + " 16\n", " \n", " \n", " 5\n", - " 36\n", + " 33\n", " \n", " \n", " 2\n", @@ -1604,11 +1456,11 @@ " \n", " \n", " 4\n", - " 13\n", + " 15\n", " \n", " \n", " 5\n", - " 36\n", + " 34\n", " \n", " \n", "\n", @@ -1618,45 +1470,38 @@ " Number of records\n", "run evaluation_score \n", "1 3 1\n", - " 4 13\n", - " 5 36\n", + " 4 16\n", + " 5 33\n", "2 3 1\n", - " 4 13\n", - " 5 36" + " 4 15\n", + " 5 34" ] }, - "execution_count": 39, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Unit test results\n", - "evaluation_df_pivot = pd.pivot_table(run_df, values='format',index=['run','evaluation_score'], #columns='position',\n", - " aggfunc='count')\n", + "evaluation_df_pivot = pd.pivot_table(\n", + " run_df,\n", + " values='format',\n", + " index=['run','evaluation_score'],\n", + " aggfunc='count'\n", + ")\n", "evaluation_df_pivot.columns = ['Number of records']\n", "evaluation_df_pivot" ] }, - { - "cell_type": "markdown", - "id": "786515fa-6841-4820-98f9-aa29ae76cf76", - "metadata": {}, - "source": [ - "#### Plotting the results\n", - "\n", - "We can create a simple bar chart to visualise the results of unit tests for both runs." - ] - }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 29, "id": "b2a18a78-55ec-43f6-9d62-929707a94364", "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1724,6 +1569,14 @@ "\n", "We hope you find this useful, and please supply any feedback." ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8368c786-38eb-4ca3-b5f4-cad63fec87bd", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {