diff --git a/apps/web-crawl-q-and-a/web-qa.ipynb b/apps/web-crawl-q-and-a/web-qa.ipynb index 02e42fd..ebe39af 100644 --- a/apps/web-crawl-q-and-a/web-qa.ipynb +++ b/apps/web-crawl-q-and-a/web-qa.ipynb @@ -627,12 +627,9 @@ " # Get the next URL from the queue\n", " url = queue.pop()\n", " print(url) # for debugging and to see the progress\n", - " \n", - " sanitized_url = re.sub(r'[^\w\s.-]', '_', url)\n", - " file_path = 'text/' + local_domain + '/' + sanitized_url[8:] + \".txt\"\n", "\n", " # Save text from the url to a .txt file\n", - " with open(file_path, \"w\") as f:\n", + " with open('text/'+local_domain+'/'+url[8:].replace(\"/\", \"_\") + \".txt\", \"w\") as f:\n", "\n", " # Get the text from the URL using BeautifulSoup\n", " soup = BeautifulSoup(requests.get(url).text, \"html.parser\")\n",