mirror of
https://github.com/james-m-jordan/openai-cookbook.git
synced 2025-05-09 19:32:38 +00:00
Msingh openai evalcookbook update (#1239)
This commit is contained in:
parent
c342687ac2
commit
626eb8d026
@ -312,18 +312,18 @@
|
|||||||
"execution_count": 4,
|
"execution_count": 4,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"ExecuteTime": {
|
"ExecuteTime": {
|
||||||
"end_time": "2024-03-18T07:23:04.716044Z",
|
"end_time": "2024-06-05T20:59:19.220486Z",
|
||||||
"start_time": "2024-03-18T07:23:04.708437Z"
|
"start_time": "2024-06-05T20:59:19.215426Z"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"'\\nspider-sql:\\n id: spider-sql.dev.v0\\n metrics: [accuracy]\\n description: Eval that scores SQL code from 194 examples in the Spider Text-to-SQL test dataset. The problems are selected by taking the first 10 problems for each database that appears in the test set.\\n Yu, Tao, et al. \"Spider; A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task.\" Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, 2018, https://doi.org/10.18653/v1/d18-1425.\\n disclaimer: Problems are solved zero-shot with no prompting other than the schema; performance may improve with training examples, fine tuning, or a different schema format. Evaluation is currently done through model-grading, where SQL code is not actually executed; the model may judge correct SQL to be incorrect, or vice-versa.\\n\\n '"
|
"'\\nspider-sql:\\n id: spider-sql.dev.v0\\n metrics: [accuracy]\\n description: Eval that scores SQL code from 194 examples in the Spider Text-to-SQL test dataset. The problems are selected by taking the first 10 problems for each database that appears in the test set.\\n Yu, Tao, et al. \"Spider; A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task.\" Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, 2018, https://doi.org/10.18653/v1/d18-1425.\\n disclaimer: Problems are solved zero-shot with no prompting other than the schema; performance may improve with training examples, fine tuning, or a different schema format. Evaluation is currently done through model-grading, where SQL code is not actually executed; the model may judge correct SQL to be incorrect, or vice-versa.\\nspider-sql.dev.v0:\\n class: evals.elsuite.modelgraded.classify:ModelBasedClassify\\n args:\\n samples_jsonl: sql/spider_sql.jsonl\\n eval_type: cot_classify\\n modelgraded_spec: sql\\n '"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 4,
|
"execution_count": 1,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -336,7 +336,12 @@
|
|||||||
" description: Eval that scores SQL code from 194 examples in the Spider Text-to-SQL test dataset. The problems are selected by taking the first 10 problems for each database that appears in the test set.\n",
|
" description: Eval that scores SQL code from 194 examples in the Spider Text-to-SQL test dataset. The problems are selected by taking the first 10 problems for each database that appears in the test set.\n",
|
||||||
" Yu, Tao, et al. \\\"Spider; A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task.\\\" Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, 2018, https://doi.org/10.18653/v1/d18-1425.\n",
|
" Yu, Tao, et al. \\\"Spider; A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task.\\\" Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, 2018, https://doi.org/10.18653/v1/d18-1425.\n",
|
||||||
" disclaimer: Problems are solved zero-shot with no prompting other than the schema; performance may improve with training examples, fine tuning, or a different schema format. Evaluation is currently done through model-grading, where SQL code is not actually executed; the model may judge correct SQL to be incorrect, or vice-versa.\n",
|
" disclaimer: Problems are solved zero-shot with no prompting other than the schema; performance may improve with training examples, fine tuning, or a different schema format. Evaluation is currently done through model-grading, where SQL code is not actually executed; the model may judge correct SQL to be incorrect, or vice-versa.\n",
|
||||||
"\n",
|
"spider-sql.dev.v0:\n",
|
||||||
|
" class: evals.elsuite.modelgraded.classify:ModelBasedClassify\n",
|
||||||
|
" args:\n",
|
||||||
|
" samples_jsonl: sql/spider_sql.jsonl\n",
|
||||||
|
" eval_type: cot_classify\n",
|
||||||
|
" modelgraded_spec: sql\n",
|
||||||
" \"\"\"\"\""
|
" \"\"\"\"\""
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -391,6 +396,8 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"These CLIs can accept various flags to modify their default behavior. You can run `oaieval --help` to see a full list of CLI options. \n",
|
"These CLIs can accept various flags to modify their default behavior. You can run `oaieval --help` to see a full list of CLI options. \n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"`oaieval` will search for the `spider-sql` eval YAML file in the `evals/registry/evals` directory, following the format specified in cell 4 above. The path to the eval dataset is specified in the eval YAML file under the args: parameter as `samples_jsonl: sql/spider_sql.jsonl`, with the file content in JSONL format (as generated in step 3 above).\n",
|
||||||
|
"\n",
|
||||||
"After running that command, you’ll see the final report of accuracy printed to the console, as well as a file path to a temporary file that contains the full report."
|
"After running that command, you’ll see the final report of accuracy printed to the console, as well as a file path to a temporary file that contains the full report."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -414,7 +421,7 @@
|
|||||||
"text": [
|
"text": [
|
||||||
"[2024-03-26 19:44:39,836] [registry.py:257] Loading registry from /Users/shyamal/.virtualenvs/openai/lib/python3.11/site-packages/evals/registry/evals\n",
|
"[2024-03-26 19:44:39,836] [registry.py:257] Loading registry from /Users/shyamal/.virtualenvs/openai/lib/python3.11/site-packages/evals/registry/evals\n",
|
||||||
"[2024-03-26 19:44:43,623] [registry.py:257] Loading registry from /Users/shyamal/.evals/evals\n",
|
"[2024-03-26 19:44:43,623] [registry.py:257] Loading registry from /Users/shyamal/.evals/evals\n",
|
||||||
"[2024-03-26 19:44:43,635] [oaieval.py:189] \u001b[1;35mRun started: 240327024443FACXGMKA\u001b[0m\n",
|
"[2024-03-26 19:44:43,635] [oaieval.py:189] \u001B[1;35mRun started: 240327024443FACXGMKA\u001B[0m\n",
|
||||||
"[2024-03-26 19:44:43,663] [registry.py:257] Loading registry from /Users/shyamal/.virtualenvs/openai/lib/python3.11/site-packages/evals/registry/modelgraded\n",
|
"[2024-03-26 19:44:43,663] [registry.py:257] Loading registry from /Users/shyamal/.virtualenvs/openai/lib/python3.11/site-packages/evals/registry/modelgraded\n",
|
||||||
"[2024-03-26 19:44:43,851] [registry.py:257] Loading registry from /Users/shyamal/.evals/modelgraded\n",
|
"[2024-03-26 19:44:43,851] [registry.py:257] Loading registry from /Users/shyamal/.evals/modelgraded\n",
|
||||||
"[2024-03-26 19:44:43,853] [data.py:90] Fetching /Users/shyamal/.virtualenvs/openai/lib/python3.11/site-packages/evals/registry/data/sql/spider_sql.jsonl\n",
|
"[2024-03-26 19:44:43,853] [data.py:90] Fetching /Users/shyamal/.virtualenvs/openai/lib/python3.11/site-packages/evals/registry/data/sql/spider_sql.jsonl\n",
|
||||||
@ -502,7 +509,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 15,
|
"execution_count": 7,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"ExecuteTime": {
|
"ExecuteTime": {
|
||||||
"end_time": "2024-03-18T20:37:01.920497Z",
|
"end_time": "2024-03-18T20:37:01.920497Z",
|
||||||
@ -714,7 +721,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 16,
|
"execution_count": 8,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"jupyter": {
|
"jupyter": {
|
||||||
@ -738,7 +745,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 17,
|
"execution_count": 9,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -782,7 +789,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 18,
|
"execution_count": 10,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -808,7 +815,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 19,
|
"execution_count": 11,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -842,7 +849,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 20,
|
"execution_count": 12,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -885,7 +892,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 21,
|
"execution_count": 13,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -931,7 +938,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 22,
|
"execution_count": 14,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user