mirror of
https://github.com/james-m-jordan/openai-cookbook.git
synced 2025-05-09 19:32:38 +00:00
This commit is contained in:
parent
bd58636a8a
commit
988139d70e
@ -235,10 +235,10 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from openai.embeddings_utils import get_embedding\n",
|
"from utils.embeddings_utils import get_embedding\n",
|
||||||
"\n",
|
"\n",
|
||||||
"df = pd.DataFrame(all_funcs)\n",
|
"df = pd.DataFrame(all_funcs)\n",
|
||||||
"df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))\n",
|
"df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))\n",
|
||||||
"df['filepath'] = df['filepath'].map(lambda x: Path(x).relative_to(code_root))\n",
|
"df['filepath'] = df['filepath'].map(lambda x: Path(x).relative_to(code_root))\n",
|
||||||
"df.to_csv(\"data/code_search_openai-python.csv\", index=False)\n",
|
"df.to_csv(\"data/code_search_openai-python.csv\", index=False)\n",
|
||||||
"df.head()"
|
"df.head()"
|
||||||
@ -266,10 +266,10 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from openai.embeddings_utils import cosine_similarity\n",
|
"from utils.embeddings_utils import cosine_similarity\n",
|
||||||
"\n",
|
"\n",
|
||||||
"def search_functions(df, code_query, n=3, pprint=True, n_lines=7):\n",
|
"def search_functions(df, code_query, n=3, pprint=True, n_lines=7):\n",
|
||||||
" embedding = get_embedding(code_query, engine='text-embedding-ada-002')\n",
|
" embedding = get_embedding(code_query, model='text-embedding-ada-002')\n",
|
||||||
" df['similarities'] = df.code_embedding.apply(lambda x: cosine_similarity(x, embedding))\n",
|
" df['similarities'] = df.code_embedding.apply(lambda x: cosine_similarity(x, embedding))\n",
|
||||||
"\n",
|
"\n",
|
||||||
" res = df.sort_values('similarities', ascending=False).head(n)\n",
|
" res = df.sort_values('similarities', ascending=False).head(n)\n",
|
||||||
|
@ -51,7 +51,7 @@
|
|||||||
"from sklearn.model_selection import train_test_split # for splitting train & test data\n",
|
"from sklearn.model_selection import train_test_split # for splitting train & test data\n",
|
||||||
"import torch # for matrix optimization\n",
|
"import torch # for matrix optimization\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from openai.embeddings_utils import get_embedding, cosine_similarity # for embeddings\n"
|
"from utils.embeddings_utils import get_embedding, cosine_similarity # for embeddings\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -193,7 +193,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"def request_completion(prompt):\n",
|
"def request_completion(prompt):\n",
|
||||||
"\n",
|
"\n",
|
||||||
" completion_response = openai.Completion.create(\n",
|
" completion_response = openai.completions.create(\n",
|
||||||
" prompt=prompt,\n",
|
" prompt=prompt,\n",
|
||||||
" temperature=0,\n",
|
" temperature=0,\n",
|
||||||
" max_tokens=5,\n",
|
" max_tokens=5,\n",
|
||||||
@ -211,7 +211,7 @@
|
|||||||
" prompt = prompt.replace('DESCRIPTION_TEXT',transaction['Description'])\n",
|
" prompt = prompt.replace('DESCRIPTION_TEXT',transaction['Description'])\n",
|
||||||
" prompt = prompt.replace('TRANSACTION_VALUE',str(transaction['Transaction value (£)']))\n",
|
" prompt = prompt.replace('TRANSACTION_VALUE',str(transaction['Transaction value (£)']))\n",
|
||||||
"\n",
|
"\n",
|
||||||
" classification = request_completion(prompt)['choices'][0]['text'].replace('\\n','')\n",
|
" classification = request_completion(prompt).choices[0].text.replace('\\n','')\n",
|
||||||
"\n",
|
"\n",
|
||||||
" return classification\n",
|
" return classification\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -304,7 +304,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"# Use our completion function to return a prediction\n",
|
"# Use our completion function to return a prediction\n",
|
||||||
"completion_response = request_completion(prompt)\n",
|
"completion_response = request_completion(prompt)\n",
|
||||||
"print(completion_response['choices'][0]['text'])\n"
|
"print(completion_response.choices[0].text)\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -916,8 +916,8 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"from utils.embeddings_utils import get_embedding\n",
|
"from utils.embeddings_utils import get_embedding\n",
|
||||||
"\n",
|
"\n",
|
||||||
"df['babbage_similarity'] = df.combined.apply(lambda x: get_embedding(x, engine='text-similarity-babbage-001'))\n",
|
"df['babbage_similarity'] = df.combined.apply(lambda x: get_embedding(x, model='text-similarity-babbage-001'))\n",
|
||||||
"df['babbage_search'] = df.combined.apply(lambda x: get_embedding(x, engine='text-search-babbage-doc-001'))\n",
|
"df['babbage_search'] = df.combined.apply(lambda x: get_embedding(x, model='text-search-babbage-doc-001'))\n",
|
||||||
"df.to_csv(embedding_path)\n"
|
"df.to_csv(embedding_path)\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -2203,7 +2203,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.8"
|
"version": "3.11.3"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -59,7 +59,7 @@
|
|||||||
"def search_reviews(df, product_description, n=3, pprint=True):\n",
|
"def search_reviews(df, product_description, n=3, pprint=True):\n",
|
||||||
" product_embedding = get_embedding(\n",
|
" product_embedding = get_embedding(\n",
|
||||||
" product_description,\n",
|
" product_description,\n",
|
||||||
" engine=\"text-embedding-ada-002\"\n",
|
" model=\"text-embedding-ada-002\"\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
" df[\"similarity\"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))\n",
|
" df[\"similarity\"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -138,7 +138,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"from utils.embeddings_utils import get_embeddings\n",
|
"from utils.embeddings_utils import get_embeddings\n",
|
||||||
"# NOTE: The following code will send a query of batch size 200 to /embeddings\n",
|
"# NOTE: The following code will send a query of batch size 200 to /embeddings\n",
|
||||||
"matrix = get_embeddings(samples[\"text\"].to_list(), engine=\"text-embedding-ada-002\")\n"
|
"matrix = get_embeddings(samples[\"text\"].to_list(), model=\"text-embedding-ada-002\")\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -93,7 +93,7 @@
|
|||||||
" labels = ['negative', 'positive'],\n",
|
" labels = ['negative', 'positive'],\n",
|
||||||
" model = EMBEDDING_MODEL,\n",
|
" model = EMBEDDING_MODEL,\n",
|
||||||
"):\n",
|
"):\n",
|
||||||
" label_embeddings = [get_embedding(label, engine=model) for label in labels]\n",
|
" label_embeddings = [get_embedding(label, model=model) for label in labels]\n",
|
||||||
"\n",
|
"\n",
|
||||||
" def label_score(review_embedding, label_embeddings):\n",
|
" def label_score(review_embedding, label_embeddings):\n",
|
||||||
" return cosine_similarity(review_embedding, label_embeddings[1]) - cosine_similarity(review_embedding, label_embeddings[0])\n",
|
" return cosine_similarity(review_embedding, label_embeddings[1]) - cosine_similarity(review_embedding, label_embeddings[0])\n",
|
||||||
|
@ -15,51 +15,53 @@ import pandas as pd
|
|||||||
|
|
||||||
|
|
||||||
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text: str, model="text-similarity-davinci-001", **kwargs) -> List[float]:
    """Return the embedding vector for a single piece of text.

    The call is wrapped in ``retry`` with random exponential waits
    (min=1, max=20) and stops after 6 attempts.

    Args:
        text: The text to embed.
        model: Name of the embedding model passed to the API.
        **kwargs: Forwarded unchanged to ``openai.embeddings.create``.

    Returns:
        The ``embedding`` of the first (and only) item in the response data.
    """
    # Newlines can negatively affect performance, so flatten them to spaces.
    single_line = text.replace("\n", " ")

    return openai.embeddings.create(input=[single_line], model=model, **kwargs).data[0].embedding
|
||||||
|
|
||||||
|
|
||||||
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
async def aget_embedding(
    text: str, model="text-similarity-davinci-001", **kwargs
) -> List[float]:
    """Asynchronously return the embedding vector for a single piece of text.

    The call is wrapped in ``retry`` with random exponential waits
    (min=1, max=20) and stops after 6 attempts.

    Args:
        text: The text to embed.
        model: Name of the embedding model passed to the API.
        **kwargs: Forwarded unchanged to the client's ``embeddings.create``.

    Returns:
        The ``embedding`` of the first (and only) item in the response data.
    """
    # replace newlines, which can negatively affect performance.
    text = text.replace("\n", " ")

    # The module-level ``openai.embeddings.create`` is synchronous in the v1
    # client, so its result cannot be awaited. Use an async client instead and
    # carry over the module-level configuration the sync helpers rely on
    # (``None`` values fall back to the client's own defaults / environment).
    async with openai.AsyncOpenAI(
        api_key=openai.api_key,
        organization=openai.organization,
        base_url=openai.base_url,
    ) as client:
        response = await client.embeddings.create(input=[text], model=model, **kwargs)

    # v1 responses are typed objects: use attribute access, not subscripting.
    return response.data[0].embedding
|
||||||
|
|
||||||
|
|
||||||
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embeddings(
    list_of_text: List[str], model="text-similarity-babbage-001", **kwargs
) -> List[List[float]]:
    """Return one embedding vector per input text, using a single API call.

    The call is wrapped in ``retry`` with random exponential waits
    (min=1, max=20) and stops after 6 attempts.

    Args:
        list_of_text: Texts to embed; at most 2048 entries.
        model: Name of the embedding model passed to the API.
        **kwargs: Forwarded unchanged to ``openai.embeddings.create``.

    Returns:
        The ``embedding`` of every item in the response data, in response order.
    """
    assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048."

    # Newlines can negatively affect performance, so flatten them to spaces.
    flattened = [entry.replace("\n", " ") for entry in list_of_text]

    response = openai.embeddings.create(input=flattened, model=model, **kwargs)
    return [item.embedding for item in response.data]
|
||||||
|
|
||||||
|
|
||||||
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
async def aget_embeddings(
    list_of_text: List[str], model="text-similarity-babbage-001", **kwargs
) -> List[List[float]]:
    """Asynchronously return one embedding vector per input text.

    The call is wrapped in ``retry`` with random exponential waits
    (min=1, max=20) and stops after 6 attempts.

    Args:
        list_of_text: Texts to embed; at most 2048 entries.
        model: Name of the embedding model passed to the API.
        **kwargs: Forwarded unchanged to the client's ``embeddings.create``.

    Returns:
        The ``embedding`` of every item in the response data, in response order.
    """
    assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048."

    # replace newlines, which can negatively affect performance.
    list_of_text = [text.replace("\n", " ") for text in list_of_text]

    # The module-level ``openai.embeddings.create`` is synchronous in the v1
    # client, so its result cannot be awaited. Use an async client instead and
    # carry over the module-level configuration the sync helpers rely on
    # (``None`` values fall back to the client's own defaults / environment).
    async with openai.AsyncOpenAI(
        api_key=openai.api_key,
        organization=openai.organization,
        base_url=openai.base_url,
    ) as client:
        response = await client.embeddings.create(input=list_of_text, model=model, **kwargs)

    return [d.embedding for d in response.data]
|
||||||
|
|
||||||
|
|
||||||
def cosine_similarity(a, b):
|
def cosine_similarity(a, b):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user