diff --git a/apps/web-crawl-q-and-a/web-qa.ipynb b/apps/web-crawl-q-and-a/web-qa.ipynb index ebe39af..02e42fd 100644 --- a/apps/web-crawl-q-and-a/web-qa.ipynb +++ b/apps/web-crawl-q-and-a/web-qa.ipynb @@ -627,9 +627,12 @@ " # Get the next URL from the queue\n", " url = queue.pop()\n", " print(url) # for debugging and to see the progress\n", + " \n", + " sanitized_url = re.sub(r'[^\\w\\s.-]', '_', url)\n", + " file_path = 'text/' + local_domain + '/' + sanitized_url[8:] + \".txt\"\n", "\n", " # Save text from the url to a .txt file\n", - " with open('text/'+local_domain+'/'+url[8:].replace(\"/\", \"_\") + \".txt\", \"w\") as f:\n", + " with open(file_path, \"w\") as f:\n", "\n", " # Get the text from the URL using BeautifulSoup\n", " soup = BeautifulSoup(requests.get(url).text, \"html.parser\")\n",