From 169f5e02c8ab13372bb066263424f9ddb31f7f9f Mon Sep 17 00:00:00 2001 From: Safa Asgar <70315479+SaFaUU@users.noreply.github.com> Date: Tue, 29 Aug 2023 23:49:23 +0600 Subject: [PATCH] File name sanitization (#630) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * File name sanitization A URL containing reserved characters prevents the output file from being created. * Regular Expression fix for Sanitized URL Co-authored-by: Simón Fishman --------- Co-authored-by: Simón Fishman --- apps/web-crawl-q-and-a/web-qa.ipynb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/apps/web-crawl-q-and-a/web-qa.ipynb b/apps/web-crawl-q-and-a/web-qa.ipynb index ebe39af..02e42fd 100644 --- a/apps/web-crawl-q-and-a/web-qa.ipynb +++ b/apps/web-crawl-q-and-a/web-qa.ipynb @@ -627,9 +627,12 @@ " # Get the next URL from the queue\n", " url = queue.pop()\n", " print(url) # for debugging and to see the progress\n", + " \n", + " sanitized_url = re.sub(r'[^\w\s.-]', '_', url)\n", + " file_path = 'text/' + local_domain + '/' + sanitized_url[8:] + \".txt\"\n", "\n", " # Save text from the url to a .txt file\n", - " with open('text/'+local_domain+'/'+url[8:].replace(\"/\", \"_\") + \".txt\", \"w\") as f:\n", + " with open(file_path, \"w\") as f:\n", "\n", " # Get the text from the URL using BeautifulSoup\n", " soup = BeautifulSoup(requests.get(url).text, \"html.parser\")\n",