mirror of
https://github.com/james-m-jordan/openai-cookbook.git
synced 2025-05-09 19:32:38 +00:00
Ted/embeddings playground (#439)
* adds embeddings-playground app * update table of contents with embeddings playground
This commit is contained in:
parent
32de9fa9ba
commit
28ab8b5c44
@ -10,6 +10,7 @@ Most code examples are written in Python, though the concepts can be applied in
|
||||
|
||||
## Recently added/updated 🆕 ✨
|
||||
|
||||
- [Embeddings playground (streamlit app)](apps/embeddings-playground/README.md) [May 19th, 2023]
|
||||
- [How to use a multi-step prompt to write unit tests](examples/Unit_test_writing_using_a_multi-step_prompt.ipynb) [May 19, 2023]
|
||||
- [How to create dynamic masks with DALL·E and Segment Anything](examples/dalle/How_to_create_dynamic_masks_with_DALL-E_and_Segment_Anything.ipynb) [May 19th, 2023]
|
||||
- [Question answering using embeddings](examples/Question_answering_using_embeddings.ipynb) [Apr 14th, 2023]
|
||||
@ -36,6 +37,7 @@ Most code examples are written in Python, though the concepts can be applied in
|
||||
- [Clustering embeddings](examples/Clustering.ipynb)
|
||||
- [Visualizing embeddings in 2D](examples/Visualizing_embeddings_in_2D.ipynb) or [3D](examples/Visualizing_embeddings_in_3D.ipynb)
|
||||
- [Embedding long texts](examples/Embedding_long_inputs.ipynb)
|
||||
- [Embeddings playground (streamlit app)](apps/embeddings-playground/README.md) [May 19th, 2023]
|
||||
- Apps
|
||||
- [File Q&A](apps/file-q-and-a/)
|
||||
- [Web Crawl Q&A](apps/web-crawl-q-and-a)
|
||||
|
42
apps/embeddings-playground/README.md
Normal file
42
apps/embeddings-playground/README.md
Normal file
@ -0,0 +1,42 @@
|
||||
# Embeddings Playground
|
||||
|
||||
[`embeddings_playground.py`](embeddings_playground.py) is a single-page streamlit app for experimenting with OpenAI embeddings.
|
||||
|
||||
## Installation
|
||||
|
||||
Before running, install required dependencies with:
|
||||
|
||||
`pip install -r apps/embeddings-playground/requirements.txt`
|
||||
|
||||
(You may need to change the path to match your local path.)
|
||||
|
||||
Verify installation of streamlit with `streamlit hello`.
|
||||
|
||||
## Usage
|
||||
|
||||
Run the script with:
|
||||
|
||||
`streamlit run apps/embeddings-playground/embeddings_playground.py`
|
||||
|
||||
(Again, you may need to change the path to match your local path.)
|
||||
|
||||
In the app, first select your choice of:
|
||||
- distance metric (we recommend cosine)
|
||||
- embedding model (we recommend `text-embedding-ada-002` for most use cases, as of May 2023)
|
||||
|
||||
Then, enter a variable number of strings to compare. Click `Rank` to see:
|
||||
- the ranked list of strings, sorted by distance from the first string
|
||||
- a heatmap showing the distance between each pair of strings
|
||||
|
||||
## Example
|
||||
|
||||
Here's an example distance matrix for 8 example strings related to `The sky is blue`:
|
||||
|
||||

|
||||
|
||||
From these distance pairs, you can see:
|
||||
- embeddings measure topical similarity more than logical similarity (e.g., `The sky is blue` is very close to `The sky is not blue`)
|
||||
- punctuation affects embeddings (e.g., `"THE. SKY. IS. BLUE!"` is only third closest to `The sky is blue`)
|
||||
- within-language pairs are stronger than across-language pairs (e.g., `El cielo es azul` is closer to `El cielo es rojo` than to `The sky is blue`)
|
||||
|
||||
Experiment with your own strings to see what you can learn.
|
178
apps/embeddings-playground/embeddings_playground.py
Normal file
178
apps/embeddings-playground/embeddings_playground.py
Normal file
@ -0,0 +1,178 @@
|
||||
"""
|
||||
EMBEDDINGS PLAYGROUND
|
||||
|
||||
This is a single-page streamlit app for experimenting with OpenAI embeddings.
|
||||
|
||||
Before running, install required dependencies with:
|
||||
|
||||
`pip install -r apps/embeddings-playground/requirements.txt`
|
||||
|
||||
You may need to change the path to match your local path.
|
||||
|
||||
Verify installation of streamlit with `streamlit hello`.
|
||||
|
||||
Run this script with:
|
||||
|
||||
`streamlit run apps/embeddings-playground/embeddings_playground.py`
|
||||
|
||||
Again, you may need to change the path to match your local path.
|
||||
"""
|
||||
|
||||
# IMPORTS
|
||||
import altair as alt
|
||||
import openai
|
||||
import os
|
||||
import pandas as pd
|
||||
from scipy import spatial
|
||||
import streamlit as st
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_random_exponential,
|
||||
)
|
||||
|
||||
# FUNCTIONS
|
||||
|
||||
# get embeddings
|
||||
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
@st.cache_data
def embedding_from_string(input: str, model: str) -> list:
    """Return the embedding of `input` computed by the OpenAI model `model`.

    A single string is sent to `openai.Embedding.create` and the embedding
    of the first item in the response's "data" list is returned.

    The call is wrapped in `st.cache_data`, and the cached function is in
    turn wrapped in a tenacity retry: up to 6 attempts, with a randomized
    exponential wait (min=1, max=60) between them.
    """
    api_result = openai.Embedding.create(input=input, model=model)
    first_item = api_result["data"][0]
    return first_item["embedding"]
|
||||
|
||||
|
||||
# plot distance matrix
|
||||
def plot_distance_matrix(strings: list, engine: str, distance: str, *, ranked: bool = False):
    """Draw a chart of the pairwise embedding distances between `strings`.

    Args:
        strings: Texts to compare. Each one is embedded with `engine` via
            `embedding_from_string`.
        engine: Name of the embedding model; also used as the chart title.
        distance: Key into the module-level `distance_metrics` dict that
            selects the distance function.
        ranked: If True, order the y axis by distance from the first string
            (closest first) instead of by input order. Defaults to False,
            which keeps both axes in input order.

    Returns:
        None. The chart is rendered with `st.altair_chart`.
    """
    # With fewer than two strings there are no off-diagonal pairs, and the
    # min/max computations below would fail on an empty frame.
    if len(strings) < 2:
        st.warning("Enter at least two strings to plot a distance matrix.")
        return

    metric = distance_metrics[distance]

    # one row per string; labels are prefixed with "(n)" so that identical
    # input strings still get distinct axis labels
    df = pd.DataFrame({"string": strings, "index": range(len(strings))})
    df["embedding"] = df["string"].apply(lambda string: embedding_from_string(string, engine))
    df["string"] = df.apply(lambda row: f"({row['index'] + 1}) {row['string']}", axis=1)
    axis_labels = df["string"].tolist()  # unique, in input order

    # create dataframe of embedding distances: every string paired with every
    # other string
    df = pd.merge(df, df, how="cross", suffixes=("_1", "_2"))
    # filter out diagonal (always 0); copy so the column assignments below
    # operate on an independent frame rather than a slice
    df = df[df["string_1"] != df["string_2"]].copy()
    df["distance"] = df.apply(
        lambda row: metric(row["embedding_1"], row["embedding_2"]),
        axis=1,
    )
    df["label"] = df["distance"].apply(lambda d: f"{d:.2f}")

    # set chart params
    text_size = 32
    label_size = 16
    pixels_per_string = 80  # aka row height & column width (perpendicular to text)
    max_label_width = 256  # NOTE(review): believed to be pixels, not characters - confirm
    # rough label width estimate: half the font size per character
    longest_label_px = max(len(label) for label in axis_labels) * label_size / 2
    chart_width = (
        50
        + min(max_label_width, longest_label_px)
        + len(strings) * pixels_per_string
    )

    # extract chart parameters from data
    color_min = df["distance"].min()
    # NOTE(review): the 1.5x headroom presumably keeps cells away from the
    # extreme end of the colour scheme - confirm intent
    color_max = 1.5 * df["distance"].max()
    x_order = axis_labels
    if ranked:
        reference = axis_labels[0]
        nearest_first = df[df["string_1"] == reference].sort_values(by="distance")
        # the reference string has no row of its own (diagonal removed), so
        # place it explicitly at the top
        y_order = [reference] + nearest_first["string_2"].tolist()
    else:
        y_order = x_order

    # create chart
    boxes = (
        alt.Chart(df, title=f"{engine}")
        .mark_rect()
        .encode(
            x=alt.X("string_1", title=None, sort=x_order),
            y=alt.Y("string_2", title=None, sort=y_order),
            color=alt.Color(
                "distance:Q",
                title=f"{distance} distance",
                scale=alt.Scale(
                    domain=[color_min, color_max],
                    scheme="darkblue",
                    reverse=True,
                ),
            ),
        )
    )

    labels = (
        boxes.mark_text(align="center", baseline="middle", fontSize=text_size)
        .encode(text="label")
        .configure_axis(labelLimit=max_label_width, labelFontSize=label_size)
        .properties(width=chart_width, height=chart_width)
    )

    st.altair_chart(labels)  # note: layered plots are not supported in streamlit :(
|
||||
|
||||
|
||||
# PAGE
|
||||
|
||||
st.title("OpenAI Embeddings Playground")

# get API key
# Prefer the OPENAI_API_KEY environment variable. If it is unset or empty,
# ask for the key on the page instead. The check is explicit rather than
# relying on an exception raised by slicing None.
env_api_key = os.getenv("OPENAI_API_KEY")
if env_api_key:
    openai.api_key = env_api_key
    # show only the first 3 and last 4 characters as confirmation
    st.write(f"API key successfully retrieved: {env_api_key[:3]}...{env_api_key[-4:]}")
else:
    st.header("Enter API Key")
    # password-type input so the key is not displayed in clear text
    openai.api_key = st.text_input("API key", type="password")
|
||||
|
||||
# select distance metric
|
||||
st.header("Select distance metric")
# Maps the label shown in the UI to the scipy function that computes that
# distance between two vectors.
_scipy_distance = spatial.distance
distance_metrics = {
    "cosine": _scipy_distance.cosine,
    "L1 (cityblock)": _scipy_distance.cityblock,
    "L2 (euclidean)": _scipy_distance.euclidean,
    "Linf (chebyshev)": _scipy_distance.chebyshev,
    # spatial.distance.correlation is left out: not sure it makes sense for
    # individual vectors - it looks like cosine
}
distance_metric_options = [*distance_metrics]
# the chosen label is later used as a key into distance_metrics
distance = st.radio("Distance metric", options=distance_metric_options)
|
||||
|
||||
# select models
|
||||
st.header("Select models")
# embedding models offered in the UI, in display order
models = [
    "text-embedding-ada-002",
    "text-similarity-ada-001",
    "text-similarity-babbage-001",
    "text-similarity-curie-001",
    "text-similarity-davinci-001",
]
# models whose checkbox starts out ticked
prechecked_models = ["text-embedding-ada-002"]
# one checkbox per model; model_values[i] is the checkbox state for models[i]
model_values = []
for model in models:
    starts_checked = model in prechecked_models
    model_values.append(st.checkbox(model, key=model, value=starts_checked))
|
||||
|
||||
# enter strings
|
||||
st.header("Enter strings")
# the number of text boxes is kept in st.session_state, starting at 5
if "num_boxes" not in st.session_state:
    st.session_state.num_boxes = 5
# the remove button is only offered while more than two boxes exist
if st.session_state.num_boxes > 2 and st.button("Remove last text box"):
    st.session_state.num_boxes -= 1
if st.button("Add new text box"):
    st.session_state.num_boxes += 1
# one text input per box, labelled "String 1", "String 2", ...
strings = [
    st.text_input(f"String {box_number}")
    for box_number in range(1, st.session_state.num_boxes + 1)
]
|
||||
|
||||
# rank strings
|
||||
st.header("Rank strings by relatedness")
if st.button("Rank"):
    # Text boxes left blank are skipped rather than sent to the embeddings
    # endpoint. The first non-blank string is the reference that the others
    # are ranked against.
    entered_strings = [s for s in strings if s.strip()]
    selected_models = [model for model, value in zip(models, model_values) if value]

    if len(entered_strings) < 2:
        st.warning("Enter at least two non-empty strings to rank.")
    elif not selected_models:
        st.warning("Select at least one model.")
    else:
        metric = distance_metrics[distance]

        # display a dataframe comparing rankings to string #1:
        # one column per selected model, rows ordered closest-first
        st.subheader("Rankings")
        ranked_strings = {}
        for model in selected_models:
            df = pd.DataFrame({"string": entered_strings})
            df["embedding"] = df["string"].apply(
                lambda string: embedding_from_string(string, model)
            )
            query_embedding = df["embedding"].iloc[0]
            df["distance"] = df["embedding"].apply(
                lambda embedding: metric(query_embedding, embedding)
            )
            ranked_strings[model] = df.sort_values(by="distance")["string"].values
        st.dataframe(pd.DataFrame(ranked_strings))

        # display charts of all the pairwise distances between strings
        st.subheader("Distance matrices")
        for model in selected_models:
            plot_distance_matrix(entered_strings, model, distance)
|
BIN
apps/embeddings-playground/example_distance_matrix.png
Normal file
BIN
apps/embeddings-playground/example_distance_matrix.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 409 KiB |
6
apps/embeddings-playground/requirements.txt
Normal file
6
apps/embeddings-playground/requirements.txt
Normal file
@ -0,0 +1,6 @@
|
||||
altair
|
||||
openai
|
||||
pandas
|
||||
scipy
|
||||
streamlit
|
||||
tenacity
|
Loading…
x
Reference in New Issue
Block a user