mirror of
https://github.com/james-m-jordan/openai-cookbook.git
synced 2025-05-09 19:32:38 +00:00
File name sanitization (#630)
* File name sanitization: a URL containing reserved characters blocks file name creation.
* Regular expression fix for the sanitized URL.

Co-authored-by: Simón Fishman <simonpfish@gmail.com>

---------

Co-authored-by: Simón Fishman <simonpfish@gmail.com>
This commit is contained in:
parent
4d330b82d7
commit
169f5e02c8
@ -627,9 +627,12 @@
|
||||
" # Get the next URL from the queue\n",
|
||||
" url = queue.pop()\n",
|
||||
" print(url) # for debugging and to see the progress\n",
|
||||
" \n",
|
||||
" sanitized_url = re.sub(r'[^\w\s.-]', '_', url)\n",
|
||||
" file_path = 'text/' + local_domain + '/' + sanitized_url[8:] + \".txt\"\n",
|
||||
"\n",
|
||||
" # Save text from the url to a <url>.txt file\n",
|
||||
" with open('text/'+local_domain+'/'+url[8:].replace(\"/\", \"_\") + \".txt\", \"w\") as f:\n",
|
||||
" with open(file_path, \"w\") as f:\n",
|
||||
"\n",
|
||||
" # Get the text from the URL using BeautifulSoup\n",
|
||||
" soup = BeautifulSoup(requests.get(url).text, \"html.parser\")\n",
|
||||
|
Loading…
x
Reference in New Issue
Block a user