From 626eb8d02645d76f3aac410a7dcff64cbbf729b9 Mon Sep 17 00:00:00 2001
From: msingh-openai <168678187+msingh-openai@users.noreply.github.com>
Date: Wed, 5 Jun 2024 16:13:29 -0700
Subject: [PATCH] Msingh openai evalcookbook update (#1239)

---
 .../Getting_Started_with_OpenAI_Evals.ipynb | 35 +++++++++++--------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/examples/evaluation/Getting_Started_with_OpenAI_Evals.ipynb b/examples/evaluation/Getting_Started_with_OpenAI_Evals.ipynb
index b610647..f493038 100644
--- a/examples/evaluation/Getting_Started_with_OpenAI_Evals.ipynb
+++ b/examples/evaluation/Getting_Started_with_OpenAI_Evals.ipynb
@@ -312,18 +312,18 @@
    "execution_count": 4,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-03-18T07:23:04.716044Z",
-     "start_time": "2024-03-18T07:23:04.708437Z"
+     "end_time": "2024-06-05T20:59:19.220486Z",
+     "start_time": "2024-06-05T20:59:19.215426Z"
     }
    },
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'\\nspider-sql:\\n  id: spider-sql.dev.v0\\n  metrics: [accuracy]\\n  description: Eval that scores SQL code from 194 examples in the Spider Text-to-SQL test dataset. The problems are selected by taking the first 10 problems for each database that appears in the test set.\\n    Yu, Tao, et al. \"Spider; A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task.\" Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, 2018, https://doi.org/10.18653/v1/d18-1425.\\n  disclaimer: Problems are solved zero-shot with no prompting other than the schema; performance may improve with training examples, fine tuning, or a different schema format. Evaluation is currently done through model-grading, where SQL code is not actually executed; the model may judge correct SQL to be incorrect, or vice-versa.\\n\\n    '"
+       "'\\nspider-sql:\\n  id: spider-sql.dev.v0\\n  metrics: [accuracy]\\n  description: Eval that scores SQL code from 194 examples in the Spider Text-to-SQL test dataset. The problems are selected by taking the first 10 problems for each database that appears in the test set.\\n    Yu, Tao, et al. \"Spider; A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task.\" Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, 2018, https://doi.org/10.18653/v1/d18-1425.\\n  disclaimer: Problems are solved zero-shot with no prompting other than the schema; performance may improve with training examples, fine tuning, or a different schema format. Evaluation is currently done through model-grading, where SQL code is not actually executed; the model may judge correct SQL to be incorrect, or vice-versa.\\nspider-sql.dev.v0:\\n  class: evals.elsuite.modelgraded.classify:ModelBasedClassify\\n  args:\\n    samples_jsonl: sql/spider_sql.jsonl\\n    eval_type: cot_classify\\n    modelgraded_spec: sql\\n    '"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 1,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -336,7 +336,12 @@
     "  description: Eval that scores SQL code from 194 examples in the Spider Text-to-SQL test dataset. The problems are selected by taking the first 10 problems for each database that appears in the test set.\n",
     "    Yu, Tao, et al. \\\"Spider; A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task.\\\" Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, 2018, https://doi.org/10.18653/v1/d18-1425.\n",
     "  disclaimer: Problems are solved zero-shot with no prompting other than the schema; performance may improve with training examples, fine tuning, or a different schema format. Evaluation is currently done through model-grading, where SQL code is not actually executed; the model may judge correct SQL to be incorrect, or vice-versa.\n",
-    "\n",
+    "spider-sql.dev.v0:\n",
+    "  class: evals.elsuite.modelgraded.classify:ModelBasedClassify\n",
+    "  args:\n",
+    "    samples_jsonl: sql/spider_sql.jsonl\n",
+    "    eval_type: cot_classify\n",
+    "    modelgraded_spec: sql\n",
     "    \"\"\"\"\""
    ]
   },
@@ -391,6 +396,8 @@
    "source": [
     "These CLIs can accept various flags to modify their default behavior. You can run `oaieval --help` to see a full list of CLI options. \n",
     "\n",
+    "`oaieval` will search for the `spider-sql` eval YAML file in the `evals/registry/evals` directory, following the format shown in cell 4 above. The path to the eval dataset is specified in that YAML file under the `args:` key as `samples_jsonl: sql/spider_sql.jsonl`; the dataset itself is in JSONL format (as generated in step 3 above).\n",
+    "\n",
     "After running that command, you’ll see the final report of accuracy printed to the console, as well as a file path to a temporary file that contains the full report."
    ]
   },
@@ -414,7 +421,7 @@
      "text": [
       "[2024-03-26 19:44:39,836] [registry.py:257] Loading registry from /Users/shyamal/.virtualenvs/openai/lib/python3.11/site-packages/evals/registry/evals\n",
       "[2024-03-26 19:44:43,623] [registry.py:257] Loading registry from /Users/shyamal/.evals/evals\n",
-      "[2024-03-26 19:44:43,635] [oaieval.py:189] \u001b[1;35mRun started: 240327024443FACXGMKA\u001b[0m\n",
+      "[2024-03-26 19:44:43,635] [oaieval.py:189] \u001B[1;35mRun started: 240327024443FACXGMKA\u001B[0m\n",
       "[2024-03-26 19:44:43,663] [registry.py:257] Loading registry from /Users/shyamal/.virtualenvs/openai/lib/python3.11/site-packages/evals/registry/modelgraded\n",
       "[2024-03-26 19:44:43,851] [registry.py:257] Loading registry from /Users/shyamal/.evals/modelgraded\n",
       "[2024-03-26 19:44:43,853] [data.py:90] Fetching /Users/shyamal/.virtualenvs/openai/lib/python3.11/site-packages/evals/registry/data/sql/spider_sql.jsonl\n",
@@ -502,7 +509,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 7,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2024-03-18T20:37:01.920497Z",
      "start_time": "2024-03-18T20:37:01.917135Z"
@@ -714,7 +721,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 8,
    "metadata": {
     "collapsed": false,
     "jupyter": {
@@ -738,7 +745,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -782,7 +789,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -808,7 +815,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -842,7 +849,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -885,7 +892,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -931,7 +938,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {