adds data download from CDN with precomputed embeddings

This commit is contained in:
Ted Sanders 2022-07-11 17:02:00 -07:00
parent 6eae26d5cc
commit 350b9a7333

View File

@ -13,14 +13,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Babbage similarity embedding performance on 1k Amazon reviews: mse=0.38, mae=0.39\n" "Babbage similarity embedding performance on 1k Amazon reviews: mse=0.39, mae=0.38\n"
] ]
} }
], ],
@ -32,8 +32,9 @@
"from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import mean_squared_error, mean_absolute_error\n", "from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
"\n", "\n",
"df = pd.read_csv('output/embedded_1k_reviews.csv')\n", "datafile_path = \"https://cdn.openai.com/API/examples/data/fine_food_reviews_with_embeddings_1k.csv\" # for your convenience, we precomputed the embeddings\n",
"df['babbage_similarity'] = df.babbage_similarity.apply(eval).apply(np.array)\n", "df = pd.read_csv(datafile_path)\n",
"df[\"babbage_similarity\"] = df.babbage_similarity.apply(eval).apply(np.array)\n",
"\n", "\n",
"X_train, X_test, y_train, y_test = train_test_split(list(df.babbage_similarity.values), df.Score, test_size=0.2, random_state=42)\n", "X_train, X_test, y_train, y_test = train_test_split(list(df.babbage_similarity.values), df.Score, test_size=0.2, random_state=42)\n",
"\n", "\n",
@ -41,30 +42,31 @@
"rfr.fit(X_train, y_train)\n", "rfr.fit(X_train, y_train)\n",
"preds = rfr.predict(X_test)\n", "preds = rfr.predict(X_test)\n",
"\n", "\n",
"\n",
"mse = mean_squared_error(y_test, preds)\n", "mse = mean_squared_error(y_test, preds)\n",
"mae = mean_absolute_error(y_test, preds)\n", "mae = mean_absolute_error(y_test, preds)\n",
"\n", "\n",
"print(f\"Babbage similarity embedding performance on 1k Amazon reviews: mse={mse:.2f}, mae={mae:.2f}\")" "print(f\"Babbage similarity embedding performance on 1k Amazon reviews: mse={mse:.2f}, mae={mae:.2f}\")\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 26, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Dummy mean prediction performance on Amazon reviews: mse=1.77, mae=1.04\n" "Dummy mean prediction performance on Amazon reviews: mse=1.81, mae=1.08\n"
] ]
} }
], ],
"source": [ "source": [
"bmse = mean_squared_error(y_test, np.repeat(y_test.mean(), len(y_test)))\n", "bmse = mean_squared_error(y_test, np.repeat(y_test.mean(), len(y_test)))\n",
"bmae = mean_absolute_error(y_test, np.repeat(y_test.mean(), len(y_test)))\n", "bmae = mean_absolute_error(y_test, np.repeat(y_test.mean(), len(y_test)))\n",
"print(f\"Dummy mean prediction performance on Amazon reviews: mse={bmse:.2f}, mae={bmae:.2f}\")" "print(\n",
" f\"Dummy mean prediction performance on Amazon reviews: mse={bmse:.2f}, mae={bmae:.2f}\"\n",
")\n"
] ]
}, },
{ {
@ -83,11 +85,9 @@
} }
], ],
"metadata": { "metadata": {
"interpreter": {
"hash": "be4b5d5b73a21c599de40d6deb1129796d12dc1cc33a738f7bac13269cfcafe8"
},
"kernelspec": { "kernelspec": {
"display_name": "Python 3.7.3 64-bit ('base': conda)", "display_name": "Python 3.9.9 ('openai')",
"language": "python",
"name": "python3" "name": "python3"
}, },
"language_info": { "language_info": {
@ -100,9 +100,14 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.7.3" "version": "3.9.9"
}, },
"orig_nbformat": 4 "orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97"
}
}
}, },
"nbformat": 4, "nbformat": 4,
"nbformat_minor": 2 "nbformat_minor": 2