From b2ca4d395c4859b2356f2745d241c278affd82b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sim=C3=B3n=20Fishman?= Date: Tue, 29 Aug 2023 17:45:47 -0700 Subject: [PATCH] Revert "File name sanitization (#630)" (#668) This reverts commit 169f5e02c8ab13372bb066263424f9ddb31f7f9f. --- apps/web-crawl-q-and-a/web-qa.ipynb | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/apps/web-crawl-q-and-a/web-qa.ipynb b/apps/web-crawl-q-and-a/web-qa.ipynb index 02e42fd..ebe39af 100644 --- a/apps/web-crawl-q-and-a/web-qa.ipynb +++ b/apps/web-crawl-q-and-a/web-qa.ipynb @@ -627,12 +627,9 @@ " # Get the next URL from the queue\n", " url = queue.pop()\n", " print(url) # for debugging and to see the progress\n", - " \n", - " sanitized_url = re.sub(r'[^\w\s.-]', '_', url)\n", - " file_path = 'text/' + local_domain + '/' + sanitized_url[8:] + \".txt\"\n", "\n", " # Save text from the url to a .txt file\n", - " with open(file_path, \"w\") as f:\n", + " with open('text/'+local_domain+'/'+url[8:].replace(\"/\", \"_\") + \".txt\", \"w\") as f:\n", "\n", " # Get the text from the URL using BeautifulSoup\n", " soup = BeautifulSoup(requests.get(url).text, \"html.parser\")\n",