From 169f5e02c8ab13372bb066263424f9ddb31f7f9f Mon Sep 17 00:00:00 2001
From: Safa Asgar <70315479+SaFaUU@users.noreply.github.com>
Date: Tue, 29 Aug 2023 23:49:23 +0600
Subject: [PATCH] File name sanitization (#630)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* File name sanitization

URLs containing characters that are reserved in file names prevent the output file from being created.

* Regular Expression fix for Sanitized URL

Co-authored-by: Simón Fishman <simonpfish@gmail.com>

---------

Co-authored-by: Simón Fishman <simonpfish@gmail.com>
---
 apps/web-crawl-q-and-a/web-qa.ipynb | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/apps/web-crawl-q-and-a/web-qa.ipynb b/apps/web-crawl-q-and-a/web-qa.ipynb
index ebe39af..02e42fd 100644
--- a/apps/web-crawl-q-and-a/web-qa.ipynb
+++ b/apps/web-crawl-q-and-a/web-qa.ipynb
@@ -627,9 +627,12 @@
     "        # Get the next URL from the queue\n",
     "        url = queue.pop()\n",
     "        print(url) # for debugging and to see the progress\n",
+    "        \n",
+    "        sanitized_url = re.sub(r'[^\\w\\s.-]', '_', url)\n",
+    "        file_path = 'text/' + local_domain + '/' + sanitized_url[8:] + \".txt\"\n",
     "\n",
     "        # Save text from the url to a <url>.txt file\n",
-    "        with open('text/'+local_domain+'/'+url[8:].replace(\"/\", \"_\") + \".txt\", \"w\") as f:\n",
+    "        with open(file_path, \"w\") as f:\n",
     "\n",
     "            # Get the text from the URL using BeautifulSoup\n",
     "            soup = BeautifulSoup(requests.get(url).text, \"html.parser\")\n",