Mirror of https://github.com/james-m-jordan/openai-cookbook.git, synced 2025-05-09 19:32:38 +00:00

Commit 8b547fdb83 (parent 502429c7c8): updates embedding examples based on ada-002
Three file diffs suppressed because one or more lines are too long.
@@ -29,8 +29,7 @@
 "import openai\n",
 "\n",
 "embedding = openai.Embedding.create(\n",
-" input=\"Your text goes here\",\n",
-" engine=\"text-embedding-ada-002\"\n",
+" input=\"Your text goes here\", model=\"text-embedding-ada-002\"\n",
 ")[\"data\"][0][\"embedding\"]\n",
 "len(embedding)\n"
 ]
@@ -54,15 +53,11 @@
 "\n",
 "\n",
 "@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))\n",
-"def get_embedding(text: str, engine=\"text-embedding-ada-002\") -> list[float]:\n",
-"\n",
-" # replace newlines, which can negatively affect performance.\n",
-" text = text.replace(\"\\n\", \" \")\n",
-"\n",
-" return openai.Embedding.create(input=[text], engine=engine)[\"data\"][0][\"embedding\"]\n",
+"def get_embedding(text: str, model=\"text-embedding-ada-002\") -> list[float]:\n",
+" return openai.Embedding.create(input=[text], model=model)[\"data\"][0][\"embedding\"]\n",
 "\n",
 "\n",
-"embedding = get_embedding(\"Your text goes here\", engine=\"text-embedding-ada-002\")\n",
+"embedding = get_embedding(\"Your text goes here\", model=\"text-embedding-ada-002\")\n",
 "print(len(embedding))\n"
 ]
 }
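Consolidated, the updated example in this notebook amounts to the short sketch below. It assumes the pre-1.0 openai Python package (which exposes openai.Embedding.create) plus the tenacity library; the only substantive change is that the engine= argument is replaced by model=:

import openai
from tenacity import retry, stop_after_attempt, wait_random_exponential

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text: str, model="text-embedding-ada-002") -> list[float]:
    # one API call per text; the vector is returned under data[0].embedding
    return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"]

embedding = get_embedding("Your text goes here")
print(len(embedding))  # text-embedding-ada-002 returns 1536-dimensional vectors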
@@ -21,7 +21,32 @@
 },
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": 6,
+"metadata": {},
+"outputs": [],
+"source": [
+"# imports\n",
+"import pandas as pd\n",
+"import tiktoken\n",
+"\n",
+"from openai.embeddings_utils import get_embedding\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 7,
+"metadata": {},
+"outputs": [],
+"source": [
+"# embedding model parameters\n",
+"embedding_model = \"text-embedding-ada-002\"\n",
+"embedding_encoding = \"cl100k_base\" # this the encoding for text-embedding-ada-002\n",
+"max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 8,
 "metadata": {},
 "outputs": [
 {
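The new parameters cell pins the tokenizer to cl100k_base, the tiktoken encoding used by text-embedding-ada-002, and caps inputs at 8,000 tokens (the model accepts up to 8,191). A minimal token-counting sketch, assuming tiktoken is installed and using a made-up review string:

import tiktoken

embedding_encoding = "cl100k_base"  # the encoding for text-embedding-ada-002
encoding = tiktoken.get_encoding(embedding_encoding)

# count tokens to decide whether a combined review is short enough to embed
n_tokens = len(encoding.encode("Title: Good coffee; Content: Smooth and rich."))
print(n_tokens)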
@@ -97,25 +122,26 @@
 "1 Title: Arrived in pieces; Content: Not pleased... "
 ]
 },
-"execution_count": 1,
+"execution_count": 8,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
-"import pandas as pd\n",
-"\n",
-"input_datapath = 'data/fine_food_reviews_1k.csv' # to save space, we provide a pre-filtered dataset\n",
+"# load & inspect dataset\n",
+"input_datapath = \"data/fine_food_reviews_1k.csv\" # to save space, we provide a pre-filtered dataset\n",
 "df = pd.read_csv(input_datapath, index_col=0)\n",
-"df = df[['Time', 'ProductId', 'UserId', 'Score', 'Summary', 'Text']]\n",
+"df = df[[\"Time\", \"ProductId\", \"UserId\", \"Score\", \"Summary\", \"Text\"]]\n",
 "df = df.dropna()\n",
-"df['combined'] = \"Title: \" + df.Summary.str.strip() + \"; Content: \" + df.Text.str.strip()\n",
-"df.head(2)"
+"df[\"combined\"] = (\n",
+" \"Title: \" + df.Summary.str.strip() + \"; Content: \" + df.Text.str.strip()\n",
+")\n",
+"df.head(2)\n"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 2,
+"execution_count": 9,
 "metadata": {},
 "outputs": [
 {
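Unescaped from the notebook JSON, the rewritten loading cell is simply the following (paths and column names exactly as in the hunk above):

import pandas as pd

# load & inspect dataset
input_datapath = "data/fine_food_reviews_1k.csv"  # to save space, a pre-filtered dataset is provided
df = pd.read_csv(input_datapath, index_col=0)
df = df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
df = df.dropna()
df["combined"] = (
    "Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
)
df.head(2)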
@@ -124,54 +150,52 @@
 "1000"
 ]
 },
-"execution_count": 2,
+"execution_count": 9,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
 "# subsample to 1k most recent reviews and remove samples that are too long\n",
-"df = df.sort_values('Time').tail(1_100)\n",
-"df.drop('Time', axis=1, inplace=True)\n",
+"top_n = 1000\n",
+"df = df.sort_values(\"Time\").tail(top_n * 2) # first cut to first 2k entries, assuming less than half will be filtered out\n",
+"df.drop(\"Time\", axis=1, inplace=True)\n",
 "\n",
-"from transformers import GPT2TokenizerFast\n",
-"tokenizer = GPT2TokenizerFast.from_pretrained(\"gpt2\")\n",
+"encoding = tiktoken.get_encoding(embedding_encoding)\n",
 "\n",
-"# remove reviews that are too long\n",
-"df['n_tokens'] = df.combined.apply(lambda x: len(tokenizer.encode(x)))\n",
-"df = df[df.n_tokens<8000].tail(1_000)\n",
-"len(df)"
+"# omit reviews that are too long to embed\n",
+"df[\"n_tokens\"] = df.combined.apply(lambda x: len(encoding.encode(x)))\n",
+"df = df[df.n_tokens <= max_tokens].tail(top_n)\n",
+"len(df)\n"
 ]
 },
 {
+"attachments": {},
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"### 2. Get embeddings and save them for future reuse"
+"## 2. Get embeddings and save them for future reuse"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": 10,
 "metadata": {},
 "outputs": [],
 "source": [
-"import openai\n",
-"from openai.embeddings_utils import get_embedding\n",
 "# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage\n",
 "\n",
-"# This will take just between 5 and 10 minutes\n",
-"df['ada_similarity'] = df.combined.apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))\n",
-"df['ada_search'] = df['ada_similarity']\n",
-"df.to_csv('data/fine_food_reviews_with_embeddings_1k.csv')"
+"# This may take a few minutes\n",
+"df[\"embedding\"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model))\n",
+"df.to_csv(\"data/fine_food_reviews_with_embeddings_1k.csv\")\n"
 ]
 }
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "openai-cookbook",
+"display_name": "openai",
 "language": "python",
-"name": "openai-cookbook"
+"name": "python3"
 },
 "language_info": {
 "codemirror_mode": {
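Continuing from the df built in the loading cell, the updated subsample-filter-embed step reads roughly as below; get_embedding is the helper imported from openai.embeddings_utils, which in this version of openai-python still takes engine=:

import tiktoken
from openai.embeddings_utils import get_embedding

embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

# subsample to the 1k most recent reviews
top_n = 1000
df = df.sort_values("Time").tail(top_n * 2)  # first cut to 2k entries, assuming less than half get filtered out
df.drop("Time", axis=1, inplace=True)

# omit reviews that are too long to embed
encoding = tiktoken.get_encoding(embedding_encoding)
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens].tail(top_n)

# get embeddings and save them for future reuse (this may take a few minutes)
df["embedding"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model))
df.to_csv("data/fine_food_reviews_with_embeddings_1k.csv")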
@@ -183,12 +207,12 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.9.6"
+"version": "3.9.9 (main, Dec 7 2021, 18:04:56) \n[Clang 13.0.0 (clang-1300.0.29.3)]"
 },
 "orig_nbformat": 4,
 "vscode": {
 "interpreter": {
-"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
+"hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97"
 }
 }
 },
One file diff suppressed because it is too large.
@@ -20,7 +20,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Ada similarity embedding performance on 1k Amazon reviews: mse=0.60, mae=0.51\n"
+"ada-002 embedding performance on 1k Amazon reviews: mse=0.62, mae=0.53\n"
 ]
 }
 ],
@@ -32,13 +32,12 @@
 "from sklearn.model_selection import train_test_split\n",
 "from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
 "\n",
-"# If you have not run the \"Obtain_dataset.ipynb\" notebook, you can download the datafile from here: https://cdn.openai.com/API/examples/data/fine_food_reviews_with_embeddings_1k.csv\n",
-"datafile_path = \"./data/fine_food_reviews_with_embeddings_1k.csv\"\n",
+"datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
 "\n",
 "df = pd.read_csv(datafile_path)\n",
-"df[\"ada_similarity\"] = df.ada_similarity.apply(eval).apply(np.array)\n",
+"df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)\n",
 "\n",
-"X_train, X_test, y_train, y_test = train_test_split(list(df.ada_similarity.values), df.Score, test_size=0.2, random_state=42)\n",
+"X_train, X_test, y_train, y_test = train_test_split(list(df.embedding.values), df.Score, test_size=0.2, random_state=42)\n",
 "\n",
 "rfr = RandomForestRegressor(n_estimators=100)\n",
 "rfr.fit(X_train, y_train)\n",
@@ -47,7 +46,7 @@
 "mse = mean_squared_error(y_test, preds)\n",
 "mae = mean_absolute_error(y_test, preds)\n",
 "\n",
-"print(f\"Ada similarity embedding performance on 1k Amazon reviews: mse={mse:.2f}, mae={mae:.2f}\")\n"
+"print(f\"ada-002 embedding performance on 1k Amazon reviews: mse={mse:.2f}, mae={mae:.2f}\")\n"
 ]
 },
 {
@@ -76,7 +75,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"We can see that the embeddings are able to predict the scores with an average error of 0.60 per score prediction. This is roughly equivalent to predicting 1 out of 3 reviews perfectly, and 1 out of two reviews by a one star error."
+"We can see that the embeddings are able to predict the scores with an average error of 0.53 per score prediction. This is roughly equivalent to predicting half of reviews perfectly, and half off by one star."
 ]
 },
 {
@@ -89,9 +88,9 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "openai-cookbook",
+"display_name": "openai",
 "language": "python",
-"name": "openai-cookbook"
+"name": "python3"
 },
 "language_info": {
 "codemirror_mode": {
@@ -103,7 +102,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.9.6"
+"version": "3.9.9"
 },
 "orig_nbformat": 4,
 "vscode": {
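Assembled into plain Python, the updated regression example looks roughly like the sketch below. The prediction step is not visible in these hunks, so the preds = rfr.predict(X_test) line is an assumption about the unchanged part of the cell:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

datafile_path = "data/fine_food_reviews_with_embeddings_1k.csv"

df = pd.read_csv(datafile_path)
df["embedding"] = df.embedding.apply(eval).apply(np.array)  # embeddings are stored as strings in the CSV

X_train, X_test, y_train, y_test = train_test_split(list(df.embedding.values), df.Score, test_size=0.2, random_state=42)

rfr = RandomForestRegressor(n_estimators=100)
rfr.fit(X_train, y_train)
preds = rfr.predict(X_test)  # assumed from the unchanged part of the cell

mse = mean_squared_error(y_test, preds)
mae = mean_absolute_error(y_test, preds)
print(f"ada-002 embedding performance on 1k Amazon reviews: mse={mse:.2f}, mae={mae:.2f}")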
@@ -18,11 +18,10 @@
 "import pandas as pd\n",
 "import numpy as np\n",
 "\n",
-"# If you have not run the \"Obtain_dataset.ipynb\" notebook, you can download the datafile from here: https://cdn.openai.com/API/examples/data/fine_food_reviews_with_embeddings_1k.csv\n",
-"datafile_path = \"./data/fine_food_reviews_with_embeddings_1k.csv\"\n",
+"datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
 "\n",
 "df = pd.read_csv(datafile_path)\n",
-"df[\"ada_search\"] = df.ada_search.apply(eval).apply(np.array)\n"
+"df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)\n"
 ]
 },
 {
@@ -55,26 +54,26 @@
 "\n",
 "# search through the reviews for a specific product\n",
 "def search_reviews(df, product_description, n=3, pprint=True):\n",
-" embedding = get_embedding(\n",
+" product_embedding = get_embedding(\n",
 " product_description,\n",
 " engine=\"text-embedding-ada-002\"\n",
 " )\n",
-" df[\"similarities\"] = df.ada_search.apply(lambda x: cosine_similarity(x, embedding))\n",
+" df[\"similarity\"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))\n",
 "\n",
-" res = (\n",
-" df.sort_values(\"similarities\", ascending=False)\n",
+" results = (\n",
+" df.sort_values(\"similarity\", ascending=False)\n",
 " .head(n)\n",
 " .combined.str.replace(\"Title: \", \"\")\n",
 " .str.replace(\"; Content:\", \": \")\n",
 " )\n",
 " if pprint:\n",
-" for r in res:\n",
+" for r in results:\n",
 " print(r[:200])\n",
 " print()\n",
-" return res\n",
+" return results\n",
 "\n",
 "\n",
-"res = search_reviews(df, \"delicious beans\", n=3)\n"
+"results = search_reviews(df, \"delicious beans\", n=3)\n"
 ]
 },
 {
@@ -96,7 +95,7 @@
 }
 ],
 "source": [
-"res = search_reviews(df, \"whole wheat pasta\", n=3)"
+"results = search_reviews(df, \"whole wheat pasta\", n=3)"
 ]
 },
 {
@@ -121,7 +120,7 @@
 }
 ],
 "source": [
-"res = search_reviews(df, \"bad delivery\", n=1)"
+"results = search_reviews(df, \"bad delivery\", n=1)"
 ]
 },
 {
@@ -146,7 +145,7 @@
 }
 ],
 "source": [
-"res = search_reviews(df, \"spoilt\", n=1)"
+"results = search_reviews(df, \"spoilt\", n=1)"
 ]
 },
 {
@@ -166,15 +165,15 @@
 }
 ],
 "source": [
-"res = search_reviews(df, \"pet food\", n=2)"
+"results = search_reviews(df, \"pet food\", n=2)"
 ]
 }
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "openai-cookbook",
+"display_name": "openai",
 "language": "python",
-"name": "openai-cookbook"
+"name": "python3"
 },
 "language_info": {
 "codemirror_mode": {
@@ -186,12 +185,12 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.9.6"
+"version": "3.9.9"
 },
 "orig_nbformat": 4,
 "vscode": {
 "interpreter": {
-"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
+"hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97"
 }
 }
 },
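Extracted from the notebook JSON, the updated search helper reads roughly as below. get_embedding and cosine_similarity are assumed to be imported from openai.embeddings_utils (as in the other notebooks); the ranking simply sorts reviews by cosine similarity between their stored embedding and the query embedding:

import numpy as np
import pandas as pd
from openai.embeddings_utils import get_embedding, cosine_similarity

datafile_path = "data/fine_food_reviews_with_embeddings_1k.csv"
df = pd.read_csv(datafile_path)
df["embedding"] = df.embedding.apply(eval).apply(np.array)

# search through the reviews for a specific product
def search_reviews(df, product_description, n=3, pprint=True):
    product_embedding = get_embedding(product_description, engine="text-embedding-ada-002")
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

    results = (
        df.sort_values("similarity", ascending=False)
        .head(n)
        .combined.str.replace("Title: ", "")
        .str.replace("; Content:", ": ")
    )
    if pprint:
        for r in results:
            print(r[:200])
            print()
    return results

results = search_reviews(df, "delicious beans", n=3)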
Data files included in this commit under examples/data/ (contents not shown in the diff):

 2001 lines  examples/data/AG_news_samples.csv
 1001 lines  examples/data/fine_food_reviews_with_embeddings_1k.csv
 4311 lines  examples/data/library_transactions_with_embeddings_359.csv
  BIN        examples/data/recommendations_embeddings_cache.pkl (binary file not shown)