[tiktoken_counting] fix tokenizer name (#741)

2025-05-09 19:32:38 +00:00 · 2023-09-27 16:12:31 -07:00 · 2023-09-27 16:12:31 -07:00 · c2959fd60b
commit c2959fd60b
parent 4631e1b74a
1 changed files with 58 additions and 10 deletions
--- a/examples/How_to_count_tokens_with_tiktoken.ipynb
+++ b/examples/How_to_count_tokens_with_tiktoken.ipynb
@ -58,6 +58,7 @@
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@ -68,14 +69,50 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: tiktoken in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (0.5.1)\n",
      "Requirement already satisfied: requests>=2.26.0 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from tiktoken) (2.31.0)\n",
      "Requirement already satisfied: regex>=2022.1.18 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from tiktoken) (2023.8.8)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (2023.7.22)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (3.4)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (3.2.0)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (2.0.5)\n",
      "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.2.1 is available.\n",
      "You should consider upgrading via the '/Users/simon/.virtualenvs/openai/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
      "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n",
      "Requirement already satisfied: openai in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (0.28.1)\n",
      "Requirement already satisfied: aiohttp in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from openai) (3.8.5)\n",
      "Requirement already satisfied: requests>=2.20 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from openai) (2.31.0)\n",
      "Requirement already satisfied: tqdm in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from openai) (4.66.1)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.20->openai) (3.4)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.20->openai) (2023.7.22)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.20->openai) (3.2.0)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.20->openai) (2.0.5)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (1.4.0)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (1.3.1)\n",
      "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (4.0.3)\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (1.9.2)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (6.0.4)\n",
      "Requirement already satisfied: attrs>=17.3.0 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (23.1.0)\n",
      "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.2.1 is available.\n",
      "You should consider upgrading via the '/Users/simon/.virtualenvs/openai/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
      "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
-    "%pip install --upgrade tiktoken"
+    "%pip install --upgrade tiktoken\n",
    "%pip install --upgrade openai"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@ -88,10 +125,11 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "import tiktoken\n"
+    "import tiktoken"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@ -112,6 +150,7 @@
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@ -128,6 +167,7 @@
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@ -136,6 +176,7 @@
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@ -163,6 +204,7 @@
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@ -203,6 +245,7 @@
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@ -210,6 +253,7 @@
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@ -237,6 +281,7 @@
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@ -244,6 +289,7 @@
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@ -271,6 +317,7 @@
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@ -278,6 +325,7 @@
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@ -297,7 +345,7 @@
    "    # print the example string\n",
    "    print(f'\\nExample string: \"{example_string}\"')\n",
    "    # for each encoding, print the # of tokens, the token integers, and the token bytes\n",
-    "    for encoding_name in [\"gpt2\", \"p50k_base\", \"cl100k_base\"]:\n",
+    "    for encoding_name in [\"r50k_base\", \"p50k_base\", \"cl100k_base\"]:\n",
    "        encoding = tiktoken.get_encoding(encoding_name)\n",
    "        token_integers = encoding.encode(example_string)\n",
    "        num_tokens = len(token_integers)\n",
@ -321,7 +369,7 @@
      "\n",
      "Example string: \"antidisestablishmentarianism\"\n",
      "\n",
-      "gpt2: 5 tokens\n",
+      "r50k_base: 5 tokens\n",
      "token integers: [415, 29207, 44390, 3699, 1042]\n",
      "token bytes: [b'ant', b'idis', b'establishment', b'arian', b'ism']\n",
      "\n",
@ -351,7 +399,7 @@
      "\n",
      "Example string: \"2 + 2 = 4\"\n",
      "\n",
-      "gpt2: 5 tokens\n",
+      "r50k_base: 5 tokens\n",
      "token integers: [17, 1343, 362, 796, 604]\n",
      "token bytes: [b'2', b' +', b' 2', b' =', b' 4']\n",
      "\n",
@ -381,7 +429,7 @@
      "\n",
      "Example string: \"お誕生日おめでとう\"\n",
      "\n",
-      "gpt2: 14 tokens\n",
+      "r50k_base: 14 tokens\n",
      "token integers: [2515, 232, 45739, 243, 37955, 33768, 98, 2515, 232, 1792, 223, 30640, 30201, 29557]\n",
      "token bytes: [b'\\xe3\\x81', b'\\x8a', b'\\xe8\\xaa', b'\\x95', b'\\xe7\\x94\\x9f', b'\\xe6\\x97', b'\\xa5', b'\\xe3\\x81', b'\\x8a', b'\\xe3\\x82', b'\\x81', b'\\xe3\\x81\\xa7', b'\\xe3\\x81\\xa8', b'\\xe3\\x81\\x86']\n",
      "\n",
@ -482,7 +530,7 @@
      "gpt-3.5-turbo\n",
      "Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.\n",
      "129 prompt tokens counted by num_tokens_from_messages().\n",
-      "127 prompt tokens counted by the OpenAI API.\n",
+      "129 prompt tokens counted by the OpenAI API.\n",
      "\n",
      "gpt-4-0314\n",
      "129 prompt tokens counted by num_tokens_from_messages().\n",
@ -575,7 +623,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.9.13"
  },
  "vscode": {
   "interpreter": {