diff --git a/apps/web-crawl-q-and-a/web-qa.ipynb b/apps/web-crawl-q-and-a/web-qa.ipynb index cee9dab..ebe39af 100644 --- a/apps/web-crawl-q-and-a/web-qa.ipynb +++ b/apps/web-crawl-q-and-a/web-qa.ipynb @@ -1114,10 +1114,11 @@ "source": [ "import pandas as pd\n", "import numpy as np\n", + "from ast import literal_eval\n", "from openai.embeddings_utils import distances_from_embeddings, cosine_similarity\n", "\n", "df=pd.read_csv('processed/embeddings.csv', index_col=0)\n", - "df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)\n", + "df['embeddings'] = df['embeddings'].apply(literal_eval).apply(np.array)\n", "\n", "df.head()" ] diff --git a/apps/web-crawl-q-and-a/web-qa.py b/apps/web-crawl-q-and-a/web-qa.py index 2be2a60..7355a5d 100644 --- a/apps/web-crawl-q-and-a/web-qa.py +++ b/apps/web-crawl-q-and-a/web-qa.py @@ -15,6 +15,7 @@ import tiktoken import openai import numpy as np from openai.embeddings_utils import distances_from_embeddings, cosine_similarity +from ast import literal_eval # Regex pattern to match a URL HTTP_URL_PATTERN = r'^http[s]{0,1}://.+$' @@ -300,7 +301,7 @@ df.head() ################################################################################ df=pd.read_csv('processed/embeddings.csv', index_col=0) -df['embeddings'] = df['embeddings'].apply(eval).apply(np.array) +df['embeddings'] = df['embeddings'].apply(literal_eval).apply(np.array) df.head() diff --git a/examples/Classification_using_embeddings.ipynb b/examples/Classification_using_embeddings.ipynb index c94d7e4..0b90b02 100644 --- a/examples/Classification_using_embeddings.ipynb +++ b/examples/Classification_using_embeddings.ipynb @@ -40,6 +40,7 @@ "# imports\n", "import pandas as pd\n", "import numpy as np\n", + "from ast import literal_eval\n", "\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.model_selection import train_test_split\n", @@ -49,7 +50,7 @@ "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n", "\n", "df = pd.read_csv(datafile_path)\n", - "df[\"embedding\"] = df.embedding.apply(eval).apply(np.array) # convert string to array\n", + "df[\"embedding\"] = df.embedding.apply(literal_eval).apply(np.array) # convert string to array\n", "\n", "# split data into train and test\n", "X_train, X_test, y_train, y_test = train_test_split(\n", @@ -67,6 +68,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -105,6 +107,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ diff --git a/examples/Clustering.ipynb b/examples/Clustering.ipynb index bf50155..585af3f 100644 --- a/examples/Clustering.ipynb +++ b/examples/Clustering.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -29,17 +30,19 @@ "# imports\n", "import numpy as np\n", "import pandas as pd\n", + "from ast import literal_eval\n", "\n", "# load data\n", "datafile_path = \"./data/fine_food_reviews_with_embeddings_1k.csv\"\n", "\n", "df = pd.read_csv(datafile_path)\n", - "df[\"embedding\"] = df.embedding.apply(eval).apply(np.array) # convert string to numpy array\n", + "df[\"embedding\"] = df.embedding.apply(literal_eval).apply(np.array) # convert string to numpy array\n", "matrix = np.vstack(df.embedding.values)\n", "matrix.shape\n" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -47,6 +50,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -155,6 +159,7 @@ ] }, { + "attachments": {}, "cell_type": 
"markdown", "metadata": {}, "source": [ @@ -240,6 +245,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ diff --git a/examples/Clustering_for_transaction_classification.ipynb b/examples/Clustering_for_transaction_classification.ipynb index adae1ae..998815d 100644 --- a/examples/Clustering_for_transaction_classification.ipynb +++ b/examples/Clustering_for_transaction_classification.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -12,6 +13,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -33,6 +35,7 @@ "import matplotlib\n", "import matplotlib.pyplot as plt\n", "import os\n", + "from ast import literal_eval\n", "\n", "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n", "COMPLETIONS_MODEL = \"text-davinci-003\"\n", @@ -42,6 +45,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -77,7 +81,7 @@ ], "source": [ "embedding_df = pd.read_csv(embedding_path)\n", - "embedding_df[\"embedding\"] = embedding_df.embedding.apply(eval).apply(np.array)\n", + "embedding_df[\"embedding\"] = embedding_df.embedding.apply(literal_eval).apply(np.array)\n", "matrix = np.vstack(embedding_df.embedding.values)\n", "matrix.shape" ] @@ -264,6 +268,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ diff --git a/examples/Multiclass_classification_for_transactions.ipynb b/examples/Multiclass_classification_for_transactions.ipynb index c3ec610..b8e5754 100644 --- a/examples/Multiclass_classification_for_transactions.ipynb +++ b/examples/Multiclass_classification_for_transactions.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -15,6 +16,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -48,6 +50,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -247,6 +250,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -303,6 +307,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -663,6 +668,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -670,6 +676,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -914,6 +921,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1078,9 +1086,10 @@ "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import classification_report, accuracy_score\n", + "from ast import literal_eval\n", "\n", "fs_df = pd.read_csv(embedding_path)\n", - "fs_df[\"babbage_similarity\"] = fs_df.babbage_similarity.apply(eval).apply(np.array)\n", + "fs_df[\"babbage_similarity\"] = fs_df.babbage_similarity.apply(literal_eval).apply(np.array)\n", "fs_df.head()" ] }, @@ -1135,6 +1144,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1144,6 +1154,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1153,6 +1164,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1703,6 +1715,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1832,6 +1845,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -2159,6 +2173,7 @@ ] }, { + 
"attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ diff --git a/examples/Regression_using_embeddings.ipynb b/examples/Regression_using_embeddings.ipynb index 323d6b0..bf691fe 100644 --- a/examples/Regression_using_embeddings.ipynb +++ b/examples/Regression_using_embeddings.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -27,6 +28,7 @@ "source": [ "import pandas as pd\n", "import numpy as np\n", + "from ast import literal_eval\n", "\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.model_selection import train_test_split\n", @@ -35,7 +37,7 @@ "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n", "\n", "df = pd.read_csv(datafile_path)\n", - "df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)\n", + "df[\"embedding\"] = df.embedding.apply(literal_eval).apply(np.array)\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(list(df.embedding.values), df.Score, test_size=0.2, random_state=42)\n", "\n", @@ -79,6 +81,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ diff --git a/examples/Semantic_text_search_using_embeddings.ipynb b/examples/Semantic_text_search_using_embeddings.ipynb index 6155dc9..c1fe81b 100644 --- a/examples/Semantic_text_search_using_embeddings.ipynb +++ b/examples/Semantic_text_search_using_embeddings.ipynb @@ -18,11 +18,12 @@ "source": [ "import pandas as pd\n", "import numpy as np\n", + "from ast import literal_eval\n", "\n", "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n", "\n", "df = pd.read_csv(datafile_path)\n", - "df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)\n" + "df[\"embedding\"] = df.embedding.apply(literal_eval).apply(np.array)\n" ] }, { @@ -101,6 +102,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -126,6 +128,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ diff --git a/examples/User_and_product_embeddings.ipynb b/examples/User_and_product_embeddings.ipynb index 565ccd2..84aa1af 100644 --- a/examples/User_and_product_embeddings.ipynb +++ b/examples/User_and_product_embeddings.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -10,6 +11,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -38,9 +40,10 @@ "import pandas as pd\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", + "from ast import literal_eval\n", "\n", "df = pd.read_csv('output/embedded_babbage_similarity_50k.csv', index_col=0) # note that you will need to generate this file to run the code below\n", - "df['babbage_similarity'] = df.babbage_similarity.apply(eval).apply(np.array)\n", + "df['babbage_similarity'] = df.babbage_similarity.apply(literal_eval).apply(np.array)\n", "X_train, X_test, y_train, y_test = train_test_split(df, df.Score, test_size = 0.2, random_state=42)\n", "\n", "user_embeddings = X_train.groupby('UserId').babbage_similarity.apply(np.mean)\n", @@ -49,6 +52,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -56,6 +60,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -89,6 +94,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -138,6 +144,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ diff --git 
a/examples/Visualizing_embeddings_in_2D.ipynb b/examples/Visualizing_embeddings_in_2D.ipynb index ea3fefa..5a563e5 100644 --- a/examples/Visualizing_embeddings_in_2D.ipynb +++ b/examples/Visualizing_embeddings_in_2D.ipynb @@ -11,6 +11,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -39,13 +40,14 @@ "import pandas as pd\n", "from sklearn.manifold import TSNE\n", "import numpy as np\n", + "from ast import literal_eval\n", "\n", "# Load the embeddings\n", "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n", "df = pd.read_csv(datafile_path)\n", "\n", "# Convert to a list of lists of floats\n", - "matrix = np.array(df.embedding.apply(eval).to_list())\n", + "matrix = np.array(df.embedding.apply(literal_eval).to_list())\n", "\n", "# Create a t-SNE model and transform the data\n", "tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)\n", @@ -54,6 +56,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ diff --git a/examples/Visualizing_embeddings_in_W&B.ipynb b/examples/Visualizing_embeddings_in_W&B.ipynb index da5bae5..10d09fc 100644 --- a/examples/Visualizing_embeddings_in_W&B.ipynb +++ b/examples/Visualizing_embeddings_in_W&B.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -20,6 +21,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -37,13 +39,14 @@ "import pandas as pd\n", "from sklearn.manifold import TSNE\n", "import numpy as np\n", + "from ast import literal_eval\n", "\n", "# Load the embeddings\n", "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n", "df = pd.read_csv(datafile_path)\n", "\n", "# Convert to a list of lists of floats\n", - "matrix = np.array(df.embedding.apply(eval).to_list())" + "matrix = np.array(df.embedding.apply(literal_eval).to_list())" ] }, { @@ -68,6 +71,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -75,6 +79,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -82,6 +87,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ diff --git a/examples/Visualizing_embeddings_with_Atlas.ipynb b/examples/Visualizing_embeddings_with_Atlas.ipynb index a47590f..7828a45 100644 --- a/examples/Visualizing_embeddings_with_Atlas.ipynb +++ b/examples/Visualizing_embeddings_with_Atlas.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -20,6 +21,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -29,20 +31,19 @@ { "cell_type": "code", "execution_count": 1, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", "output_type": "stream", - "text": [ - ] + "text": [] } ], "source": [ "!pip install nomic" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", @@ -52,13 +53,14 @@ "source": [ "import pandas as pd\n", "import numpy as np\n", + "from ast import literal_eval\n", "\n", "# Load the embeddings\n", "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n", "df = pd.read_csv(datafile_path)\n", "\n", "# Convert to a list of lists of floats\n", - "embeddings = np.array(df.embedding.apply(eval).to_list())\n", + "embeddings = np.array(df.embedding.apply(literal_eval).to_list())\n", "df = df.drop('embedding', axis=1)\n", "df = df.rename(columns={'Unnamed: 0': 'id'})\n" ] @@ -71,8 
+73,7 @@ { "name": "stderr", "output_type": "stream", - "text": [ - ] + "text": [] } ], "source": [ @@ -88,6 +89,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -97,11 +99,65 @@ { "cell_type": "code", "execution_count": 10, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { - "text/plain": "meek-laborer: https://atlas.nomic.ai/map/fddc0e07-97c5-477c-827c-96bca44519aa/463f4614-7689-47e4-b55b-1da0cc679559", - "text/html": "\n
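Taken together, these hunks apply one change throughout the example notebooks and the web-qa script: embedding vectors that were serialized into CSV as strings are now parsed with `ast.literal_eval` instead of the built-in `eval`. `literal_eval` only accepts Python literals (numbers, strings, lists, tuples, dicts, booleans, None), so a malformed or malicious value in the CSV raises an exception instead of being executed as code. The remaining churn ("attachments": {} entries on markdown cells, relocated "metadata" keys, "text": [] outputs) appears to be metadata rewritten by a newer notebook front-end and does not change any code. Below is a minimal sketch of the parsing pattern the diffs converge on, using the same file and column names as the example notebooks; it assumes that CSV is present locally.

import numpy as np
import pandas as pd
from ast import literal_eval

# Embeddings are stored in the CSV as stringified lists such as "[0.12, -0.03, ...]".
df = pd.read_csv("data/fine_food_reviews_with_embeddings_1k.csv")

# literal_eval parses each string back into a Python list but refuses anything
# that is not a plain literal, unlike eval(), which would execute arbitrary
# expressions embedded in the file.
df["embedding"] = df["embedding"].apply(literal_eval).apply(np.array)

# Stack the per-row vectors into an (n_rows, n_dims) matrix, as the clustering
# and t-SNE notebooks do before fitting their models.
matrix = np.vstack(df["embedding"].values)

On well-formed rows the two calls produce identical results; the switch only changes what happens when a row is not a plain literal.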