adds data for Obtain_dataset.ipynb

Ted Sanders 2022-07-11 15:28:57 -07:00
parent 25d641d21f
commit 2515ddc7b8
2 changed files with 1039 additions and 43 deletions


@@ -45,57 +45,48 @@
 " <th>Text</th>\n",
 " <th>combined</th>\n",
 " </tr>\n",
-" <tr>\n",
-" <th>Id</th>\n",
-" <th></th>\n",
-" <th></th>\n",
-" <th></th>\n",
-" <th></th>\n",
-" <th></th>\n",
-" <th></th>\n",
-" <th></th>\n",
-" </tr>\n",
 " </thead>\n",
 " <tbody>\n",
 " <tr>\n",
-" <th>1</th>\n",
-" <td>1303862400</td>\n",
-" <td>B001E4KFG0</td>\n",
-" <td>A3SGXH7AUHU8GW</td>\n",
+" <th>0</th>\n",
+" <td>1351123200</td>\n",
+" <td>B003XPF9BO</td>\n",
+" <td>A3R7JR3FMEBXQB</td>\n",
 " <td>5</td>\n",
-" <td>Good Quality Dog Food</td>\n",
-" <td>I have bought several of the Vitality canned d...</td>\n",
-" <td>Title: Good Quality Dog Food; Content: I have ...</td>\n",
+" <td>where does one start...and stop... with a tre...</td>\n",
+" <td>Wanted to save some to bring to my Chicago fam...</td>\n",
+" <td>Title: where does one start...and stop... wit...</td>\n",
 " </tr>\n",
 " <tr>\n",
-" <th>2</th>\n",
-" <td>1346976000</td>\n",
-" <td>B00813GRG4</td>\n",
-" <td>A1D87F6ZCVE5NK</td>\n",
+" <th>1</th>\n",
+" <td>1351123200</td>\n",
+" <td>B003JK537S</td>\n",
+" <td>A3JBPC3WFUT5ZP</td>\n",
 " <td>1</td>\n",
-" <td>Not as Advertised</td>\n",
-" <td>Product arrived labeled as Jumbo Salted Peanut...</td>\n",
-" <td>Title: Not as Advertised; Content: Product arr...</td>\n",
+" <td>Arrived in pieces</td>\n",
+" <td>Not pleased at all. When I opened the box, mos...</td>\n",
+" <td>Title: Arrived in pieces; Content: Not pleased...</td>\n",
 " </tr>\n",
 " </tbody>\n",
 "</table>\n",
 "</div>"
 ],
 "text/plain": [
-" Time ProductId UserId Score Summary \\\n",
-"Id \n",
-"1 1303862400 B001E4KFG0 A3SGXH7AUHU8GW 5 Good Quality Dog Food \n",
-"2 1346976000 B00813GRG4 A1D87F6ZCVE5NK 1 Not as Advertised \n",
+" Time ProductId UserId Score \\\n",
+"0 1351123200 B003XPF9BO A3R7JR3FMEBXQB 5 \n",
+"1 1351123200 B003JK537S A3JBPC3WFUT5ZP 1 \n",
 "\n",
+" Summary \\\n",
+"0 where does one start...and stop... with a tre... \n",
+"1 Arrived in pieces \n",
+"\n",
 " Text \\\n",
-"Id \n",
-"1 I have bought several of the Vitality canned d... \n",
-"2 Product arrived labeled as Jumbo Salted Peanut... \n",
+"0 Wanted to save some to bring to my Chicago fam... \n",
+"1 Not pleased at all. When I opened the box, mos... \n",
 "\n",
 " combined \n",
-"Id \n",
-"1 Title: Good Quality Dog Food; Content: I have ... \n",
-"2 Title: Not as Advertised; Content: Product arr... "
+"0 Title: where does one start...and stop... wit... \n",
+"1 Title: Arrived in pieces; Content: Not pleased... "
 ]
 },
 "execution_count": 1,
@@ -106,7 +97,8 @@
 "source": [
 "import pandas as pd\n",
 "\n",
-"df = pd.read_csv('input/Reviews.csv', index_col=0)\n",
+"input_datapath = 'data/fine_food_reviews_1k.csv' # to save space, we provide a pre-filtered dataset\n",
+"df = pd.read_csv(input_datapath, index_col=0)\n",
 "df = df[['Time', 'ProductId', 'UserId', 'Score', 'Summary', 'Text']]\n",
 "df = df.dropna()\n",
 "df['combined'] = \"Title: \" + df.Summary.str.strip() + \"; Content: \" + df.Text.str.strip()\n",
@@ -161,16 +153,14 @@
 "# This will take just under 10 minutes\n",
 "df['babbage_similarity'] = df.combined.apply(lambda x: get_embedding(x, engine='text-similarity-babbage-001'))\n",
 "df['babbage_search'] = df.combined.apply(lambda x: get_embedding(x, engine='text-search-babbage-doc-001'))\n",
-"df.to_csv('output/embedded_1k_reviews.csv')"
+"df.to_csv('data/fine_food_reviews_with_embeddings_1k.csv')"
 ]
 }
 ],
 "metadata": {
-"interpreter": {
-"hash": "be4b5d5b73a21c599de40d6deb1129796d12dc1cc33a738f7bac13269cfcafe8"
-},
 "kernelspec": {
-"display_name": "Python 3.7.3 64-bit ('base': conda)",
+"display_name": "Python 3.9.9 ('openai')",
 "language": "python",
 "name": "python3"
 },
 "language_info": {
@@ -185,7 +175,12 @@
 "pygments_lexer": "ipython3",
 "version": "3.9.9"
 },
-"orig_nbformat": 4
+"orig_nbformat": 4,
+"vscode": {
+"interpreter": {
+"hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97"
+}
+}
 },
 "nbformat": 4,
 "nbformat_minor": 2

File diff suppressed because it is too large