mirror of
https://github.com/james-m-jordan/openai-cookbook.git
synced 2025-05-09 19:32:38 +00:00
[tiktoken_counting] fix tokenizer name (#741)
This commit is contained in:
parent
4631e1b74a
commit
c2959fd60b
@ -58,6 +58,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
@ -68,14 +69,50 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 1,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Requirement already satisfied: tiktoken in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (0.5.1)\n",
|
||||||
|
"Requirement already satisfied: requests>=2.26.0 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from tiktoken) (2.31.0)\n",
|
||||||
|
"Requirement already satisfied: regex>=2022.1.18 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from tiktoken) (2023.8.8)\n",
|
||||||
|
"Requirement already satisfied: certifi>=2017.4.17 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (2023.7.22)\n",
|
||||||
|
"Requirement already satisfied: idna<4,>=2.5 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (3.4)\n",
|
||||||
|
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (3.2.0)\n",
|
||||||
|
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (2.0.5)\n",
|
||||||
|
"\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.2.1 is available.\n",
|
||||||
|
"You should consider upgrading via the '/Users/simon/.virtualenvs/openai/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
|
||||||
|
"\u001b[0mNote: you may need to restart the kernel to use updated packages.\n",
|
||||||
|
"Requirement already satisfied: openai in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (0.28.1)\n",
|
||||||
|
"Requirement already satisfied: aiohttp in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from openai) (3.8.5)\n",
|
||||||
|
"Requirement already satisfied: requests>=2.20 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from openai) (2.31.0)\n",
|
||||||
|
"Requirement already satisfied: tqdm in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from openai) (4.66.1)\n",
|
||||||
|
"Requirement already satisfied: idna<4,>=2.5 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.20->openai) (3.4)\n",
|
||||||
|
"Requirement already satisfied: certifi>=2017.4.17 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.20->openai) (2023.7.22)\n",
|
||||||
|
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.20->openai) (3.2.0)\n",
|
||||||
|
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.20->openai) (2.0.5)\n",
|
||||||
|
"Requirement already satisfied: frozenlist>=1.1.1 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (1.4.0)\n",
|
||||||
|
"Requirement already satisfied: aiosignal>=1.1.2 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (1.3.1)\n",
|
||||||
|
"Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (4.0.3)\n",
|
||||||
|
"Requirement already satisfied: yarl<2.0,>=1.0 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (1.9.2)\n",
|
||||||
|
"Requirement already satisfied: multidict<7.0,>=4.5 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (6.0.4)\n",
|
||||||
|
"Requirement already satisfied: attrs>=17.3.0 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (23.1.0)\n",
|
||||||
|
"\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.2.1 is available.\n",
|
||||||
|
"You should consider upgrading via the '/Users/simon/.virtualenvs/openai/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
|
||||||
|
"\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"%pip install --upgrade tiktoken"
|
"%pip install --upgrade tiktoken\n",
|
||||||
|
"%pip install --upgrade openai"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
@ -88,10 +125,11 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import tiktoken\n"
|
"import tiktoken"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
@ -112,6 +150,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
@ -128,6 +167,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
@ -136,6 +176,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
@ -163,6 +204,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
@ -203,6 +245,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
@ -210,6 +253,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
@ -237,6 +281,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
@ -244,6 +289,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
@ -271,6 +317,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
@ -278,6 +325,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
@ -297,7 +345,7 @@
|
|||||||
" # print the example string\n",
|
" # print the example string\n",
|
||||||
" print(f'\\nExample string: \"{example_string}\"')\n",
|
" print(f'\\nExample string: \"{example_string}\"')\n",
|
||||||
" # for each encoding, print the # of tokens, the token integers, and the token bytes\n",
|
" # for each encoding, print the # of tokens, the token integers, and the token bytes\n",
|
||||||
" for encoding_name in [\"gpt2\", \"p50k_base\", \"cl100k_base\"]:\n",
|
" for encoding_name in [\"r50k_base\", \"p50k_base\", \"cl100k_base\"]:\n",
|
||||||
" encoding = tiktoken.get_encoding(encoding_name)\n",
|
" encoding = tiktoken.get_encoding(encoding_name)\n",
|
||||||
" token_integers = encoding.encode(example_string)\n",
|
" token_integers = encoding.encode(example_string)\n",
|
||||||
" num_tokens = len(token_integers)\n",
|
" num_tokens = len(token_integers)\n",
|
||||||
@ -321,7 +369,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"Example string: \"antidisestablishmentarianism\"\n",
|
"Example string: \"antidisestablishmentarianism\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"gpt2: 5 tokens\n",
|
"r50k_base: 5 tokens\n",
|
||||||
"token integers: [415, 29207, 44390, 3699, 1042]\n",
|
"token integers: [415, 29207, 44390, 3699, 1042]\n",
|
||||||
"token bytes: [b'ant', b'idis', b'establishment', b'arian', b'ism']\n",
|
"token bytes: [b'ant', b'idis', b'establishment', b'arian', b'ism']\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -351,7 +399,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"Example string: \"2 + 2 = 4\"\n",
|
"Example string: \"2 + 2 = 4\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"gpt2: 5 tokens\n",
|
"r50k_base: 5 tokens\n",
|
||||||
"token integers: [17, 1343, 362, 796, 604]\n",
|
"token integers: [17, 1343, 362, 796, 604]\n",
|
||||||
"token bytes: [b'2', b' +', b' 2', b' =', b' 4']\n",
|
"token bytes: [b'2', b' +', b' 2', b' =', b' 4']\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -381,7 +429,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"Example string: \"お誕生日おめでとう\"\n",
|
"Example string: \"お誕生日おめでとう\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"gpt2: 14 tokens\n",
|
"r50k_base: 14 tokens\n",
|
||||||
"token integers: [2515, 232, 45739, 243, 37955, 33768, 98, 2515, 232, 1792, 223, 30640, 30201, 29557]\n",
|
"token integers: [2515, 232, 45739, 243, 37955, 33768, 98, 2515, 232, 1792, 223, 30640, 30201, 29557]\n",
|
||||||
"token bytes: [b'\\xe3\\x81', b'\\x8a', b'\\xe8\\xaa', b'\\x95', b'\\xe7\\x94\\x9f', b'\\xe6\\x97', b'\\xa5', b'\\xe3\\x81', b'\\x8a', b'\\xe3\\x82', b'\\x81', b'\\xe3\\x81\\xa7', b'\\xe3\\x81\\xa8', b'\\xe3\\x81\\x86']\n",
|
"token bytes: [b'\\xe3\\x81', b'\\x8a', b'\\xe8\\xaa', b'\\x95', b'\\xe7\\x94\\x9f', b'\\xe6\\x97', b'\\xa5', b'\\xe3\\x81', b'\\x8a', b'\\xe3\\x82', b'\\x81', b'\\xe3\\x81\\xa7', b'\\xe3\\x81\\xa8', b'\\xe3\\x81\\x86']\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -482,7 +530,7 @@
|
|||||||
"gpt-3.5-turbo\n",
|
"gpt-3.5-turbo\n",
|
||||||
"Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.\n",
|
"Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.\n",
|
||||||
"129 prompt tokens counted by num_tokens_from_messages().\n",
|
"129 prompt tokens counted by num_tokens_from_messages().\n",
|
||||||
"127 prompt tokens counted by the OpenAI API.\n",
|
"129 prompt tokens counted by the OpenAI API.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"gpt-4-0314\n",
|
"gpt-4-0314\n",
|
||||||
"129 prompt tokens counted by num_tokens_from_messages().\n",
|
"129 prompt tokens counted by num_tokens_from_messages().\n",
|
||||||
@ -575,7 +623,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.7.3"
|
"version": "3.9.13"
|
||||||
},
|
},
|
||||||
"vscode": {
|
"vscode": {
|
||||||
"interpreter": {
|
"interpreter": {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user