Update embeddings_utils.py and related notebooks to API V1 (issue #855) (#857)

Author: Gabor Cselle, 2023-11-14 13:31:55 -08:00 (committed by GitHub)
Parent: bd58636a8a
Commit: 988139d70e
Signature: GPG Key ID 4AEE18F83AFDEB23 (no known key found for this signature in database)
7 changed files with 27 additions and 25 deletions

View File

@@ -235,10 +235,10 @@
 }
 ],
 "source": [
-"from openai.embeddings_utils import get_embedding\n",
+"from utils.embeddings_utils import get_embedding\n",
 "\n",
 "df = pd.DataFrame(all_funcs)\n",
-"df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))\n",
+"df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))\n",
 "df['filepath'] = df['filepath'].map(lambda x: Path(x).relative_to(code_root))\n",
 "df.to_csv(\"data/code_search_openai-python.csv\", index=False)\n",
 "df.head()"
@@ -266,10 +266,10 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from openai.embeddings_utils import cosine_similarity\n",
+"from utils.embeddings_utils import cosine_similarity\n",
 "\n",
 "def search_functions(df, code_query, n=3, pprint=True, n_lines=7):\n",
-"    embedding = get_embedding(code_query, engine='text-embedding-ada-002')\n",
+"    embedding = get_embedding(code_query, model='text-embedding-ada-002')\n",
 "    df['similarities'] = df.code_embedding.apply(lambda x: cosine_similarity(x, embedding))\n",
 "\n",
 "    res = df.sort_values('similarities', ascending=False).head(n)\n",

View File

@@ -51,7 +51,7 @@
 "from sklearn.model_selection import train_test_split # for splitting train & test data\n",
 "import torch # for matrix optimization\n",
 "\n",
-"from openai.embeddings_utils import get_embedding, cosine_similarity # for embeddings\n"
+"from utils.embeddings_utils import get_embedding, cosine_similarity # for embeddings\n"
 ]
 },
 {
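
These notebooks now import from the local utils package instead of the removed openai.embeddings_utils module. A hedged sketch of a path shim for running a notebook from a different working directory (the directory name below is an assumption):

import sys

# Assumption: embeddings_utils.py lives under a sibling utils/ directory;
# adjust the path to wherever the notebook is actually launched from.
sys.path.append("examples")

from utils.embeddings_utils import get_embedding, cosine_similarity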

View File

@@ -193,7 +193,7 @@
 "source": [
 "def request_completion(prompt):\n",
 "\n",
-"    completion_response = openai.Completion.create(\n",
+"    completion_response = openai.completions.create(\n",
 "        prompt=prompt,\n",
 "        temperature=0,\n",
 "        max_tokens=5,\n",
@@ -211,7 +211,7 @@
 "    prompt = prompt.replace('DESCRIPTION_TEXT',transaction['Description'])\n",
 "    prompt = prompt.replace('TRANSACTION_VALUE',str(transaction['Transaction value (£)']))\n",
 "\n",
-"    classification = request_completion(prompt)['choices'][0]['text'].replace('\\n','')\n",
+"    classification = request_completion(prompt).choices[0].text.replace('\\n','')\n",
 "\n",
 "    return classification\n",
 "\n",
@@ -304,7 +304,7 @@
 "\n",
 "# Use our completion function to return a prediction\n",
 "completion_response = request_completion(prompt)\n",
-"print(completion_response['choices'][0]['text'])\n"
+"print(completion_response.choices[0].text)\n"
 ]
 },
 {
@@ -916,8 +916,8 @@
 "source": [
 "from utils.embeddings_utils import get_embedding\n",
 "\n",
-"df['babbage_similarity'] = df.combined.apply(lambda x: get_embedding(x, engine='text-similarity-babbage-001'))\n",
-"df['babbage_search'] = df.combined.apply(lambda x: get_embedding(x, engine='text-search-babbage-doc-001'))\n",
+"df['babbage_similarity'] = df.combined.apply(lambda x: get_embedding(x, model='text-similarity-babbage-001'))\n",
+"df['babbage_search'] = df.combined.apply(lambda x: get_embedding(x, model='text-search-babbage-doc-001'))\n",
 "df.to_csv(embedding_path)\n"
 ]
 },
@@ -2203,7 +2203,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.10.8"
+"version": "3.11.3"
 }
 },
 "nbformat": 4,

View File

@@ -59,7 +59,7 @@
 "def search_reviews(df, product_description, n=3, pprint=True):\n",
 "    product_embedding = get_embedding(\n",
 "        product_description,\n",
-"        engine=\"text-embedding-ada-002\"\n",
+"        model=\"text-embedding-ada-002\"\n",
 "    )\n",
 "    df[\"similarity\"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))\n",
 "\n",

View File

@@ -138,7 +138,7 @@
 "source": [
 "from utils.embeddings_utils import get_embeddings\n",
 "# NOTE: The following code will send a query of batch size 200 to /embeddings\n",
-"matrix = get_embeddings(samples[\"text\"].to_list(), engine=\"text-embedding-ada-002\")\n"
+"matrix = get_embeddings(samples[\"text\"].to_list(), model=\"text-embedding-ada-002\")\n"
 ]
 },
 {
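
get_embeddings accepts up to 2048 inputs per request; the notebook sends its 200 samples in one call. For larger corpora, chunking is one option. A sketch, assuming the same samples DataFrame:

from utils.embeddings_utils import get_embeddings

texts = samples["text"].to_list()
matrix = []
for i in range(0, len(texts), 200):  # 200 matches the notebook's batch size
    matrix.extend(get_embeddings(texts[i : i + 200], model="text-embedding-ada-002"))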

View File

@@ -93,7 +93,7 @@
 "    labels = ['negative', 'positive'],\n",
 "    model = EMBEDDING_MODEL,\n",
 "):\n",
-"    label_embeddings = [get_embedding(label, engine=model) for label in labels]\n",
+"    label_embeddings = [get_embedding(label, model=model) for label in labels]\n",
 "\n",
 "    def label_score(review_embedding, label_embeddings):\n",
 "        return cosine_similarity(review_embedding, label_embeddings[1]) - cosine_similarity(review_embedding, label_embeddings[0])\n",

View File

@@ -15,51 +15,53 @@ import pandas as pd

 @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
-def get_embedding(text: str, engine="text-similarity-davinci-001", **kwargs) -> List[float]:
+def get_embedding(text: str, model="text-similarity-davinci-001", **kwargs) -> List[float]:
     # replace newlines, which can negatively affect performance.
     text = text.replace("\n", " ")
-    return openai.Embedding.create(input=[text], engine=engine, **kwargs)["data"][0]["embedding"]
+    response = openai.embeddings.create(input=[text], model=model, **kwargs)
+    return response.data[0].embedding

 @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
 async def aget_embedding(
-    text: str, engine="text-similarity-davinci-001", **kwargs
+    text: str, model="text-similarity-davinci-001", **kwargs
 ) -> List[float]:
     # replace newlines, which can negatively affect performance.
     text = text.replace("\n", " ")
-    return (await openai.Embedding.acreate(input=[text], engine=engine, **kwargs))["data"][0][
+    return (await openai.embeddings.create(input=[text], model=model, **kwargs))["data"][0][
         "embedding"
     ]

 @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
 def get_embeddings(
-    list_of_text: List[str], engine="text-similarity-babbage-001", **kwargs
+    list_of_text: List[str], model="text-similarity-babbage-001", **kwargs
 ) -> List[List[float]]:
     assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048."
     # replace newlines, which can negatively affect performance.
     list_of_text = [text.replace("\n", " ") for text in list_of_text]
-    data = openai.Embedding.create(input=list_of_text, engine=engine, **kwargs).data
-    return [d["embedding"] for d in data]
+    data = openai.embeddings.create(input=list_of_text, model=model, **kwargs).data
+    return [d.embedding for d in data]

 @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
 async def aget_embeddings(
-    list_of_text: List[str], engine="text-similarity-babbage-001", **kwargs
+    list_of_text: List[str], model="text-similarity-babbage-001", **kwargs
 ) -> List[List[float]]:
     assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048."
     # replace newlines, which can negatively affect performance.
     list_of_text = [text.replace("\n", " ") for text in list_of_text]
-    data = (await openai.Embedding.acreate(input=list_of_text, engine=engine, **kwargs)).data
-    return [d["embedding"] for d in data]
+    data = (await openai.embeddings.create(input=list_of_text, model=model, **kwargs)).data
+    return [d.embedding for d in data]

 def cosine_similarity(a, b):
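
One caveat for the async helpers: the module-level openai.embeddings.create in the V1 SDK is synchronous, and V1 response objects no longer support ["data"][0]["embedding"] indexing. A V1-idiomatic async sketch using AsyncOpenAI (a sketch under those assumptions, not the committed code; the helper name and default model are hypothetical):

from typing import List

from openai import AsyncOpenAI

aclient = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment

async def aget_embedding_v1(text: str, model: str = "text-embedding-ada-002") -> List[float]:
    # replace newlines, which can negatively affect performance.
    text = text.replace("\n", " ")
    response = await aclient.embeddings.create(input=[text], model=model)
    return response.data[0].embedding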