Update model for hyde/chroma cookbook (#1785)

commit d97f1078c1
parent 8e89464703
@@ -292,3 +292,8 @@ rzhao-openai:
   name: "Randy Zhao"
   website: "https://www.linkedin.com/in/randy-zhao-27433616b"
   avatar: "https://avatars.githubusercontent.com/u/208724779?v=4"
+
+brandonbaker-openai:
+  name: "Brandon Baker"
+  website: "https://www.linkedin.com/in/brandonbaker18"
+  avatar: "https://avatars.githubusercontent.com/u/208719822"
@@ -34,13 +34,17 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 2,
+    "execution_count": 1,
     "metadata": {},
     "outputs": [
      {
       "name": "stdout",
       "output_type": "stream",
       "text": [
+        "\u001b[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n",
+        "\u001b[0m\n",
+        "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n",
+        "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
         "Note: you may need to restart the kernel to use updated packages.\n"
        ]
       }
@@ -60,31 +64,43 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 3,
+    "execution_count": 2,
     "metadata": {},
     "outputs": [
      {
       "name": "stdout",
       "output_type": "stream",
       "text": [
-        "OPENAI_API_KEY is ready\n"
+        "OpenAI client is ready\n"
       ]
      }
     ],
     "source": [
      "import os\n",
+     "from openai import OpenAI\n",
      "\n",
      "# Uncomment the following line to set the environment variable in the notebook\n",
      "# os.environ[\"OPENAI_API_KEY\"] = 'sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'\n",
      "\n",
-     "if os.getenv(\"OPENAI_API_KEY\") is not None:\n",
-     "  print(\"OPENAI_API_KEY is ready\")\n",
-     "  import openai\n",
-     "  openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
+     "api_key = os.getenv(\"OPENAI_API_KEY\")\n",
+     "\n",
+     "if api_key:\n",
+     "    client = OpenAI(api_key=api_key)\n",
+     "    print(\"OpenAI client is ready\")\n",
+     "else:\n",
+     "    print(\"OPENAI_API_KEY environment variable not found\")"
     ]
    },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Set the model for all API calls\n",
+     "OPENAI_MODEL = \"gpt-4o\""
+    ]
+   },
    {
     "attachments": {},
     "cell_type": "markdown",
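Note: taken together, the two setup cells touched above replace the legacy module-level openai.api_key configuration with an explicit client object, and pin the model name in a single constant. Reassembled from the hunk, with nothing added beyond what the diff itself introduces:

import os
from openai import OpenAI

api_key = os.getenv("OPENAI_API_KEY")

if api_key:
    # One client instance is reused for every API call in the notebook
    client = OpenAI(api_key=api_key)
    print("OpenAI client is ready")
else:
    print("OPENAI_API_KEY environment variable not found")

# Set the model for all API calls
OPENAI_MODEL = "gpt-4o"
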
@@ -206,7 +222,7 @@
    "source": [
     "# Just asking the model\n",
     "\n",
-    "GPT-3.5 was trained on a large amount of scientific information. As a baseline, we'd like to understand what the model already knows without any further context. This will allow us to calibrate overall performance. \n",
+    "ChatGPT was trained on a large amount of scientific information. As a baseline, we'd like to understand what the model already knows without any further context. This will allow us to calibrate overall performance. \n",
     "\n",
     "We construct an appropriate prompt, with some example facts, then query the model with each claim in the dataset. We ask the model to assess a claim as 'True', 'False', or 'NEE' if there is not enough evidence one way or the other. "
    ]
@@ -255,8 +271,8 @@
     "    responses = []\n",
     "    # Query the OpenAI API\n",
     "    for claim in claims:\n",
-    "        response = openai.ChatCompletion.create(\n",
-    "            model='gpt-3.5-turbo',\n",
+    "        response = client.chat.completions.create(\n",
+    "            model=OPENAI_MODEL,\n",
     "            messages=build_prompt(claim),\n",
     "            max_tokens=3,\n",
     "        )\n",
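Note: only the API call changes in this hunk; the enclosing function definition and build_prompt are outside it. Filled out for orientation, the baseline assessment loop after the change looks roughly like this (the function name, the return value, and the .strip() are assumptions, not part of the diff):

def assess_claims(claims):
    responses = []
    # Query the OpenAI API once per claim
    for claim in claims:
        response = client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=build_prompt(claim),  # few-shot prompt asking for 'True' / 'False' / 'NEE'
            max_tokens=3,
        )
        # Keep only the short verdict text returned by the model
        responses.append(response.choices[0].message.content.strip())
    return responses
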
@@ -270,16 +286,16 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We sample 100 claims from the dataset"
+    "We sample 50 claims from the dataset"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Let's take a look at 100 claims\n",
+    "# Let's take a look at 50 claims\n",
     "samples = claim_df.sample(50)\n",
     "\n",
     "claims = samples['claim'].tolist() \n"
@@ -294,7 +310,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 6,
+    "execution_count": 7,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -315,7 +331,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 25,
+    "execution_count": 8,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -332,7 +348,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 8,
+    "execution_count": 9,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -367,7 +383,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 26,
+    "execution_count": 10,
     "metadata": {},
     "outputs": [
      {
@@ -376,20 +392,20 @@
       "text": [
        "\tGroundtruth\n",
        "\tTrue\tFalse\tNEE\n",
-       "True\t15\t5\t14\t\n",
-       "False\t0\t2\t1\t\n",
-       "NEE\t3\t3\t7\t\n"
+       "True\t12\t4\t16\t\n",
+       "False\t0\t4\t3\t\n",
+       "NEE\t6\t2\t3\t\n"
       ]
      },
      {
       "data": {
        "text/plain": [
-        "{'True': {'True': 15, 'False': 5, 'NEE': 14},\n",
-        " 'False': {'True': 0, 'False': 2, 'NEE': 1},\n",
-        " 'NEE': {'True': 3, 'False': 3, 'NEE': 7}}"
+        "{'True': {'True': 12, 'False': 4, 'NEE': 16},\n",
+        " 'False': {'True': 0, 'False': 4, 'NEE': 3},\n",
+        " 'NEE': {'True': 6, 'False': 2, 'NEE': 3}}"
       ]
      },
-     "execution_count": 26,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -427,7 +443,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 10,
+    "execution_count": 11,
     "metadata": {},
     "outputs": [
      {
@@ -513,7 +529,7 @@
       "4  [Two human Golli (for gene expressed in the ol...  False "
      ]
     },
-    "execution_count": 10,
+    "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -545,18 +561,9 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 11,
+    "execution_count": 12,
     "metadata": {},
-    "outputs": [
-     {
-      "name": "stdout",
-      "output_type": "stream",
-      "text": [
-       "Running Chroma using direct local API.\n",
-       "Using DuckDB in-memory for database. Data will be transient.\n"
-      ]
-     }
-    ],
+    "outputs": [],
     "source": [
      "import chromadb\n",
      "from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction\n",
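Note: this cell only imports chromadb and the OpenAI embedding function; the collection setup lives outside the hunk, which is why only the stale DuckDB output is dropped here. A minimal sketch of what that setup typically looks like with a current chromadb client (the collection name and embedding model below are illustrative, not taken from the notebook):

import os
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

# In-memory client; recent Chroma versions no longer emit the DuckDB notice
# that the removed output cell contained.
chroma_client = chromadb.Client()

embedding_fn = OpenAIEmbeddingFunction(
    api_key=os.getenv("OPENAI_API_KEY"),
    model_name="text-embedding-3-small",  # illustrative choice
)

claim_collection = chroma_client.create_collection(
    name="scifact_corpus",  # illustrative name
    embedding_function=embedding_fn,
)
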
@@ -577,7 +584,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 12,
+    "execution_count": 13,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -603,7 +610,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 27,
+    "execution_count": 14,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -619,7 +626,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 14,
+    "execution_count": 15,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -647,8 +654,8 @@
     "        if len(context) == 0:\n",
     "            responses.append('NEE')\n",
     "            continue\n",
-    "        response = openai.ChatCompletion.create(\n",
-    "            model='gpt-3.5-turbo',\n",
+    "        response = client.chat.completions.create(\n",
+    "            model=OPENAI_MODEL,\n",
     "            messages=build_prompt_with_context(claim=claim, context=context),\n",
     "            max_tokens=3,\n",
     "        )\n",
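Note: as with the baseline loop, only the API call changes here; the NEE fallback for an empty context is already visible in the surrounding context lines. Sketched in full, with the function signature and return value assumed rather than taken from the diff:

def assess_claims_with_context(claims, contexts):
    responses = []
    for claim, context in zip(claims, contexts):
        # No supporting documents survived retrieval: not enough evidence
        if len(context) == 0:
            responses.append('NEE')
            continue
        response = client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=build_prompt_with_context(claim=claim, context=context),
            max_tokens=3,
        )
        responses.append(response.choices[0].message.content.strip())
    return responses
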
@@ -667,7 +674,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 28,
+    "execution_count": 16,
     "metadata": {},
     "outputs": [
      {
@@ -676,20 +683,20 @@
       "text": [
        "\tGroundtruth\n",
        "\tTrue\tFalse\tNEE\n",
-       "True\t16\t2\t8\t\n",
-       "False\t1\t6\t5\t\n",
-       "NEE\t1\t2\t9\t\n"
+       "True\t13\t0\t3\t\n",
+       "False\t0\t9\t3\t\n",
+       "NEE\t5\t1\t16\t\n"
       ]
      },
      {
       "data": {
        "text/plain": [
-        "{'True': {'True': 16, 'False': 2, 'NEE': 8},\n",
-        " 'False': {'True': 1, 'False': 6, 'NEE': 5},\n",
-        " 'NEE': {'True': 1, 'False': 2, 'NEE': 9}}"
+        "{'True': {'True': 13, 'False': 0, 'NEE': 3},\n",
+        " 'False': {'True': 0, 'False': 9, 'NEE': 3},\n",
+        " 'NEE': {'True': 5, 'False': 1, 'NEE': 16}}"
       ]
      },
-     "execution_count": 28,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -706,7 +713,7 @@
    "source": [
     "## Results\n",
     "\n",
-    "We see that the model is a lot less likely to evaluate a False claim as true (2 instances VS 5 previously), but that claims without enough evidence are still often assessed as True or False.\n",
+    "We see that the model performs better overall, and is now significantly better at correctly identifying false claims. Additionally, most NEE cases are also correctly identified now.\n",
     "\n",
     "Taking a look at the retrieved documents, we see that they are sometimes not relevant to the claim - this causes the model to be confused by the extra information, and it may decide that sufficient evidence is present, even when the information is irrelevant. This happens because we always ask for the 3 'most' relevant documents, but these might not be relevant at all beyond a certain point. "
    ]
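Note: the "3 'most' relevant documents" mentioned in the cell above come from how the collection is queried. A minimal sketch of such a query with chromadb (variable names are illustrative, carried over from the earlier sketch):

# Retrieve the 3 nearest documents for every claim in a single call
results = claim_collection.query(query_texts=claims, n_results=3)

# results['documents'][i] holds the retrieved passages for claims[i];
# results['distances'][i] holds the corresponding embedding distances,
# which the threshold-filtering step below relies on.
contexts = results['documents']
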
@@ -742,7 +749,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 29,
+    "execution_count": 18,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -758,7 +765,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 30,
+    "execution_count": 19,
     "metadata": {},
     "outputs": [
      {
@@ -767,20 +774,20 @@
       "text": [
        "\tGroundtruth\n",
        "\tTrue\tFalse\tNEE\n",
-       "True\t10\t2\t1\t\n",
-       "False\t0\t2\t1\t\n",
-       "NEE\t8\t6\t20\t\n"
+       "True\t6\t0\t3\t\n",
+       "False\t0\t3\t0\t\n",
+       "NEE\t12\t7\t19\t\n"
       ]
      },
      {
       "data": {
        "text/plain": [
-        "{'True': {'True': 10, 'False': 2, 'NEE': 1},\n",
-        " 'False': {'True': 0, 'False': 2, 'NEE': 1},\n",
-        " 'NEE': {'True': 8, 'False': 6, 'NEE': 20}}"
+        "{'True': {'True': 6, 'False': 0, 'NEE': 3},\n",
+        " 'False': {'True': 0, 'False': 3, 'NEE': 0},\n",
+        " 'NEE': {'True': 12, 'False': 7, 'NEE': 19}}"
       ]
      },
-     "execution_count": 30,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -797,7 +804,8 @@
    "source": [
     "## Results\n",
     "\n",
-    "The model now assesses many fewer claims as True or False when there is not enough evidence present. However, it now biases away from certainty. Most claims are now assessed as having not enough evidence, because a large fraction of them are filtered out by the distance threshold. It's possible to tune the distance threshold to find the optimal operating point, but this can be difficult, and is dataset and embedding model dependent. "
+    "\n",
+    "The model now assesses many fewer claims as True or False when there is not enough evidence present. However, it also is now much more cautious, tending to label most items as not enough evidence, biasing away from certainty. Most claims are now assessed as having not enough evidence, because a large fraction of them are filtered out by the distance threshold. It's possible to tune the distance threshold to find the optimal operating point, but this can be difficult, and is dataset and embedding model dependent. "
    ]
   },
   {
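Note: the distance-threshold filtering discussed in the cell above is implemented elsewhere in the notebook and is not part of this diff; it amounts to dropping any retrieved document whose embedding distance exceeds a cutoff before it is used as context. A hedged sketch, with the helper name and threshold value purely illustrative:

def filter_by_distance(query_result, threshold=0.35):
    # Keep only documents closer than `threshold`; a claim can end up with an
    # empty context list, in which case it is later assessed as 'NEE'.
    filtered = []
    for docs, dists in zip(query_result['documents'], query_result['distances']):
        filtered.append([doc for doc, dist in zip(docs, dists) if dist < threshold])
    return filtered

contexts = filter_by_distance(results, threshold=0.35)
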
@@ -859,12 +867,11 @@
     "\n",
     "\n",
     "def hallucinate_evidence(claims):\n",
     "    # Query the OpenAI API\n",
     "    responses = []\n",
-    "    # Query the OpenAI API\n",
     "    for claim in claims:\n",
-    "        response = openai.ChatCompletion.create(\n",
-    "            model='gpt-3.5-turbo',\n",
+    "        response = client.chat.completions.create(\n",
+    "            model=OPENAI_MODEL,\n",
     "            messages=build_hallucination_prompt(claim),\n",
     "        )\n",
     "        responses.append(response.choices[0].message.content)\n",
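Note: hallucinate_evidence is the HyDE step: rather than embedding the claim directly, the model first writes a plausible supporting document, and that hallucinated text is what gets embedded and matched against the corpus. The surrounding flow sits outside this hunk; roughly, and reusing the illustrative helpers sketched earlier:

# Generate a hypothetical supporting document for each sampled claim
hallucinated_evidence = hallucinate_evidence(claims)

# Query the corpus with the hallucinated documents instead of the bare claims,
# then filter and assess exactly as before
results = claim_collection.query(query_texts=hallucinated_evidence, n_results=3)
contexts = filter_by_distance(results, threshold=0.35)
claim_assessments = assess_claims_with_context(claims, contexts)
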
@@ -877,12 +884,12 @@
    "source": [
     "We hallucinate a document for each claim.\n",
     "\n",
-    "*NB: This can take a while, about 30m for 100 claims*. You can reduce the number of claims we want to assess to get results more quickly. "
+    "*NB: This can take a while, about 7m for 100 claims*. You can reduce the number of claims we want to assess to get results more quickly. "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -898,7 +905,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 32,
+    "execution_count": 22,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -915,7 +922,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 33,
+    "execution_count": 23,
     "metadata": {},
     "outputs": [
      {
@@ -924,20 +931,20 @@
       "text": [
        "\tGroundtruth\n",
        "\tTrue\tFalse\tNEE\n",
-       "True\t15\t2\t5\t\n",
-       "False\t1\t5\t4\t\n",
-       "NEE\t2\t3\t13\t\n"
+       "True\t11\t0\t5\t\n",
+       "False\t0\t8\t1\t\n",
+       "NEE\t7\t2\t16\t\n"
       ]
      },
      {
       "data": {
        "text/plain": [
-        "{'True': {'True': 15, 'False': 2, 'NEE': 5},\n",
-        " 'False': {'True': 1, 'False': 5, 'NEE': 4},\n",
-        " 'NEE': {'True': 2, 'False': 3, 'NEE': 13}}"
+        "{'True': {'True': 11, 'False': 0, 'NEE': 5},\n",
+        " 'False': {'True': 0, 'False': 8, 'NEE': 1},\n",
+        " 'NEE': {'True': 7, 'False': 2, 'NEE': 16}}"
       ]
      },
-     "execution_count": 33,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -969,7 +976,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": ".venv",
    "language": "python",
    "name": "python3"
   },
@@ -983,12 +990,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.12"
-  },
-  "vscode": {
-   "interpreter": {
-    "hash": "fd16a328ca3d68029457069b79cb0b38eb39a0f5ccc4fe4473d3047707df8207"
-   }
+   "version": "3.12.10"
   }
  },
 "nbformat": 4,
@@ -691,9 +691,10 @@
 - title: Robust question answering with Chroma and OpenAI
   path: examples/vector_databases/chroma/hyde-with-chroma-and-openai.ipynb
-  date: 2023-04-06
+  date: 2025-04-23
   authors:
     - atroyn
+    - brandonbaker
   tags:
     - embeddings
     - completions