mirror of
https://github.com/james-m-jordan/openai-cookbook.git
synced 2025-05-09 19:32:38 +00:00
This commit is contained in:
parent
6390b8c66e
commit
786d0a0b9b
@ -22,16 +22,90 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>ProductId</th>\n",
|
||||
" <th>UserId</th>\n",
|
||||
" <th>Score</th>\n",
|
||||
" <th>Summary</th>\n",
|
||||
" <th>Text</th>\n",
|
||||
" <th>combined</th>\n",
|
||||
" <th>n_tokens</th>\n",
|
||||
" <th>embedding</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>B003XPF9BO</td>\n",
|
||||
" <td>A3R7JR3FMEBXQB</td>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>where does one start...and stop... with a tre...</td>\n",
|
||||
" <td>Wanted to save some to bring to my Chicago fam...</td>\n",
|
||||
" <td>Title: where does one start...and stop... wit...</td>\n",
|
||||
" <td>52</td>\n",
|
||||
" <td>[0.007018072064965963, -0.02731654793024063, 0...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>297</th>\n",
|
||||
" <td>B003VXHGPK</td>\n",
|
||||
" <td>A21VWSCGW7UUAR</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>Good, but not Wolfgang Puck good</td>\n",
|
||||
" <td>Honestly, I have to admit that I expected a li...</td>\n",
|
||||
" <td>Title: Good, but not Wolfgang Puck good; Conte...</td>\n",
|
||||
" <td>178</td>\n",
|
||||
" <td>[-0.003140551969408989, -0.009995664469897747,...</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
"(24502, 19035)"
|
||||
" ProductId UserId Score \n",
|
||||
"0 B003XPF9BO A3R7JR3FMEBXQB 5 \\\n",
|
||||
"297 B003VXHGPK A21VWSCGW7UUAR 4 \n",
|
||||
"\n",
|
||||
" Summary \n",
|
||||
"0 where does one start...and stop... with a tre... \\\n",
|
||||
"297 Good, but not Wolfgang Puck good \n",
|
||||
"\n",
|
||||
" Text \n",
|
||||
"0 Wanted to save some to bring to my Chicago fam... \\\n",
|
||||
"297 Honestly, I have to admit that I expected a li... \n",
|
||||
"\n",
|
||||
" combined n_tokens \n",
|
||||
"0 Title: where does one start...and stop... wit... 52 \\\n",
|
||||
"297 Title: Good, but not Wolfgang Puck good; Conte... 178 \n",
|
||||
"\n",
|
||||
" embedding \n",
|
||||
"0 [0.007018072064965963, -0.02731654793024063, 0... \n",
|
||||
"297 [-0.003140551969408989, -0.009995664469897747,... "
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -42,8 +116,28 @@
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from ast import literal_eval\n",
|
||||
"\n",
|
||||
"df = pd.read_csv('output/embedded_babbage_similarity_50k.csv', index_col=0) # note that you will need to generate this file to run the code below\n",
|
||||
"df['babbage_similarity'] = df.babbage_similarity.apply(literal_eval).apply(np.array)\n",
|
||||
"df = pd.read_csv('data/fine_food_reviews_with_embeddings_1k.csv', index_col=0) # note that you will need to generate this file to run the code below\n",
|
||||
"df.head(2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(577, 706)"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df['babbage_similarity'] = df[\"embedding\"].apply(literal_eval).apply(np.array)\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(df, df.Score, test_size = 0.2, random_state=42)\n",
|
||||
"\n",
|
||||
"user_embeddings = X_train.groupby('UserId').babbage_similarity.apply(np.mean)\n",
|
||||
@ -71,7 +165,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -105,7 +199,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
Loading…
x
Reference in New Issue
Block a user