mirror of
https://github.com/james-m-jordan/openai-cookbook.git
synced 2025-05-09 19:32:38 +00:00
This commit is contained in:
parent
6390b8c66e
commit
786d0a0b9b
@ -22,16 +22,90 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 5,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>ProductId</th>\n",
|
||||||
|
" <th>UserId</th>\n",
|
||||||
|
" <th>Score</th>\n",
|
||||||
|
" <th>Summary</th>\n",
|
||||||
|
" <th>Text</th>\n",
|
||||||
|
" <th>combined</th>\n",
|
||||||
|
" <th>n_tokens</th>\n",
|
||||||
|
" <th>embedding</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>B003XPF9BO</td>\n",
|
||||||
|
" <td>A3R7JR3FMEBXQB</td>\n",
|
||||||
|
" <td>5</td>\n",
|
||||||
|
" <td>where does one start...and stop... with a tre...</td>\n",
|
||||||
|
" <td>Wanted to save some to bring to my Chicago fam...</td>\n",
|
||||||
|
" <td>Title: where does one start...and stop... wit...</td>\n",
|
||||||
|
" <td>52</td>\n",
|
||||||
|
" <td>[0.007018072064965963, -0.02731654793024063, 0...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>297</th>\n",
|
||||||
|
" <td>B003VXHGPK</td>\n",
|
||||||
|
" <td>A21VWSCGW7UUAR</td>\n",
|
||||||
|
" <td>4</td>\n",
|
||||||
|
" <td>Good, but not Wolfgang Puck good</td>\n",
|
||||||
|
" <td>Honestly, I have to admit that I expected a li...</td>\n",
|
||||||
|
" <td>Title: Good, but not Wolfgang Puck good; Conte...</td>\n",
|
||||||
|
" <td>178</td>\n",
|
||||||
|
" <td>[-0.003140551969408989, -0.009995664469897747,...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"(24502, 19035)"
|
" ProductId UserId Score \n",
|
||||||
|
"0 B003XPF9BO A3R7JR3FMEBXQB 5 \\\n",
|
||||||
|
"297 B003VXHGPK A21VWSCGW7UUAR 4 \n",
|
||||||
|
"\n",
|
||||||
|
" Summary \n",
|
||||||
|
"0 where does one start...and stop... with a tre... \\\n",
|
||||||
|
"297 Good, but not Wolfgang Puck good \n",
|
||||||
|
"\n",
|
||||||
|
" Text \n",
|
||||||
|
"0 Wanted to save some to bring to my Chicago fam... \\\n",
|
||||||
|
"297 Honestly, I have to admit that I expected a li... \n",
|
||||||
|
"\n",
|
||||||
|
" combined n_tokens \n",
|
||||||
|
"0 Title: where does one start...and stop... wit... 52 \\\n",
|
||||||
|
"297 Title: Good, but not Wolfgang Puck good; Conte... 178 \n",
|
||||||
|
"\n",
|
||||||
|
" embedding \n",
|
||||||
|
"0 [0.007018072064965963, -0.02731654793024063, 0... \n",
|
||||||
|
"297 [-0.003140551969408989, -0.009995664469897747,... "
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 2,
|
"execution_count": 5,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -42,8 +116,28 @@
|
|||||||
"from sklearn.model_selection import train_test_split\n",
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
"from ast import literal_eval\n",
|
"from ast import literal_eval\n",
|
||||||
"\n",
|
"\n",
|
||||||
"df = pd.read_csv('output/embedded_babbage_similarity_50k.csv', index_col=0) # note that you will need to generate this file to run the code below\n",
|
"df = pd.read_csv('data/fine_food_reviews_with_embeddings_1k.csv', index_col=0) # note that you will need to generate this file to run the code below\n",
|
||||||
"df['babbage_similarity'] = df.babbage_similarity.apply(literal_eval).apply(np.array)\n",
|
"df.head(2)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"(577, 706)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df['babbage_similarity'] = df[\"embedding\"].apply(literal_eval).apply(np.array)\n",
|
||||||
"X_train, X_test, y_train, y_test = train_test_split(df, df.Score, test_size = 0.2, random_state=42)\n",
|
"X_train, X_test, y_train, y_test = train_test_split(df, df.Score, test_size = 0.2, random_state=42)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"user_embeddings = X_train.groupby('UserId').babbage_similarity.apply(np.mean)\n",
|
"user_embeddings = X_train.groupby('UserId').babbage_similarity.apply(np.mean)\n",
|
||||||
@ -71,7 +165,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 8,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -105,7 +199,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 18,
|
"execution_count": 10,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user