Mirror of https://github.com/james-m-jordan/openai-cookbook.git
synced 2025-05-09 19:32:38 +00:00
fixes token counting in translate_latex_book.ipynb (#579)
* fixes token counting in translate_latex_book.ipynb * adds back comment
This commit is contained in:
parent
07c0351216
commit
17858f204f
@ -110,27 +110,37 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def group_chunks(chunks, ntokens, max_len=1000):\n",
|
||||
"def group_chunks(chunks, ntokens, max_len=1000, hard_max_len=3000):\n",
|
||||
" \"\"\"\n",
|
||||
" Group very short chunks, to form approximately a page long chunks.\n",
|
||||
" Group very short chunks, to form approximately page long chunks.\n",
|
||||
" \"\"\"\n",
|
||||
" batches = []\n",
|
||||
" cur_batch = \"\"\n",
|
||||
" cur_tokens = 0\n",
|
||||
"\n",
|
||||
" \n",
|
||||
" # iterate over chunks, and group the short ones together\n",
|
||||
" for chunk, ntoken in zip(chunks, ntokens):\n",
|
||||
" cur_tokens += ntoken + 2 # +2 for the newlines between chunks\n",
|
||||
" # discard chunks that exceed hard max length\n",
|
||||
" if ntoken > hard_max_len:\n",
|
||||
" print(f\"Warning: Chunk discarded for being too long ({ntoken} tokens > {hard_max_len} token limit). Preview: '{chunk[:50]}...'\")\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" # if adding this chunk would exceed the max length, finalize the current batch and start a new one\n",
|
||||
" if ntoken + cur_tokens > max_len:\n",
|
||||
" # if room in current batch, add new chunk\n",
|
||||
" if cur_tokens + 1 + ntoken <= max_len:\n",
|
||||
" cur_batch += \"\\n\\n\" + chunk\n",
|
||||
" cur_tokens += 1 + ntoken # adds 1 token for the two newlines\n",
|
||||
" # otherwise, record the batch and start a new one\n",
|
||||
" else:\n",
|
||||
" batches.append(cur_batch)\n",
|
||||
" cur_batch = chunk\n",
|
||||
" else:\n",
|
||||
" cur_batch += \"\\n\\n\" + chunk\n",
|
||||
" batches.append(cur_batch)\n",
|
||||
" cur_tokens = ntoken\n",
|
||||
" \n",
|
||||
" if cur_batch: # add the last batch if it's not empty\n",
|
||||
" batches.append(cur_batch)\n",
|
||||
" \n",
|
||||
" return batches\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"chunks = group_chunks(chunks, ntokens)\n",
|
||||
"len(chunks)"
|
||||
]
|
||||
|
Loading…
x
Reference in New Issue
Block a user