Fix for issue #858: User_and_product_embeddings.ipynb points to incorrect CSV (#863)

2025-05-09 19:32:38 +00:00 · 2023-11-27 13:41:04 -08:00 · 2023-11-27 13:41:04 -08:00 · 786d0a0b9b
commit 786d0a0b9b
parent 6390b8c66e
1 changed files with 101 additions and 7 deletions
--- a/examples/User_and_product_embeddings.ipynb
+++ b/examples/User_and_product_embeddings.ipynb
@@ -22,16 +22,90 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ProductId</th>\n",
+       "      <th>UserId</th>\n",
+       "      <th>Score</th>\n",
+       "      <th>Summary</th>\n",
+       "      <th>Text</th>\n",
+       "      <th>combined</th>\n",
+       "      <th>n_tokens</th>\n",
+       "      <th>embedding</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>B003XPF9BO</td>\n",
+       "      <td>A3R7JR3FMEBXQB</td>\n",
+       "      <td>5</td>\n",
+       "      <td>where does one  start...and stop... with a tre...</td>\n",
+       "      <td>Wanted to save some to bring to my Chicago fam...</td>\n",
+       "      <td>Title: where does one  start...and stop... wit...</td>\n",
+       "      <td>52</td>\n",
+       "      <td>[0.007018072064965963, -0.02731654793024063, 0...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>297</th>\n",
+       "      <td>B003VXHGPK</td>\n",
+       "      <td>A21VWSCGW7UUAR</td>\n",
+       "      <td>4</td>\n",
+       "      <td>Good, but not Wolfgang Puck good</td>\n",
+       "      <td>Honestly, I have to admit that I expected a li...</td>\n",
+       "      <td>Title: Good, but not Wolfgang Puck good; Conte...</td>\n",
+       "      <td>178</td>\n",
+       "      <td>[-0.003140551969408989, -0.009995664469897747,...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
      "text/plain": [
-       "(24502, 19035)"
+       "      ProductId          UserId  Score   \n",
+       "0    B003XPF9BO  A3R7JR3FMEBXQB      5  \\\n",
+       "297  B003VXHGPK  A21VWSCGW7UUAR      4   \n",
+       "\n",
+       "                                               Summary   \n",
+       "0    where does one  start...and stop... with a tre...  \\\n",
+       "297                   Good, but not Wolfgang Puck good   \n",
+       "\n",
+       "                                                  Text   \n",
+       "0    Wanted to save some to bring to my Chicago fam...  \\\n",
+       "297  Honestly, I have to admit that I expected a li...   \n",
+       "\n",
+       "                                              combined  n_tokens   \n",
+       "0    Title: where does one  start...and stop... wit...        52  \\\n",
+       "297  Title: Good, but not Wolfgang Puck good; Conte...       178   \n",
+       "\n",
+       "                                             embedding  \n",
+       "0    [0.007018072064965963, -0.02731654793024063, 0...  \n",
+       "297  [-0.003140551969408989, -0.009995664469897747,...  "
      ]
     },
-     "execution_count": 2,
+     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -42,8 +116,28 @@
    "from sklearn.model_selection import train_test_split\n",
    "from ast import literal_eval\n",
    "\n",
-    "df = pd.read_csv('output/embedded_babbage_similarity_50k.csv', index_col=0)  # note that you will need to generate this file to run the code below\n",
-    "df['babbage_similarity'] = df.babbage_similarity.apply(literal_eval).apply(np.array)\n",
+    "df = pd.read_csv('data/fine_food_reviews_with_embeddings_1k.csv', index_col=0)  # note that you will need to generate this file to run the code below\n",
+    "df.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(577, 706)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['babbage_similarity'] = df[\"embedding\"].apply(literal_eval).apply(np.array)\n",
    "X_train, X_test, y_train, y_test = train_test_split(df, df.Score, test_size = 0.2, random_state=42)\n",
    "\n",
    "user_embeddings = X_train.groupby('UserId').babbage_similarity.apply(np.mean)\n",
@@ -71,7 +165,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -105,7 +199,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {