Catch the exception thrown by `with open()` and continue with the queue (#155)

Co-authored-by: Ashok Manghat <amanghat@rmplc.net>
This commit is contained in: DevilsWorkShop, 2023-09-12 04:25:57 +05:30, committed by GitHub
parent 18b7ee5f37
commit 39b62a6c09
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -20,6 +20,9 @@ from ast import literal_eval
# Regex pattern to match a URL (anchored; requires http or https scheme).
# NOTE(review): `[s]{0,1}` is equivalent to the more idiomatic `s?` — left
# as-is because these are diff context lines.
HTTP_URL_PATTERN = r'^http[s]{0,1}://.+$'
# Define OpenAI api_key
# openai.api_key = '<Your API Key>'
# Define root domain to crawl — the crawler stays within this domain.
domain = "openai.com"
# Seed URL where the crawl begins.
full_url = "https://openai.com/"
@ -137,6 +140,8 @@ def crawl(url):
url = queue.pop()
print(url) # for debugging and to see the progress
# Try extracting the text from the link, if failed proceed with the next item in the queue
try:
# Save text from the url to a <url>.txt file
with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:
@ -152,6 +157,8 @@ def crawl(url):
# Otherwise, write the text to the file in the text directory
f.write(text)
except Exception as e:
print("Unable to parse page " + url)
# Get the hyperlinks from the URL and add them to the queue
for link in get_domain_hyperlinks(local_domain, url):