Mirror of https://github.com/james-m-jordan/openai-cookbook.git
Synced 2025-05-09 19:32:38 +00:00
Catch the exception thrown by with open() and continue with the queue (#155)
Co-authored-by: Ashok Manghat <amanghat@rmplc.net>
This commit is contained in:
parent 18b7ee5f37
commit 39b62a6c09
@@ -20,6 +20,9 @@ from ast import literal_eval
 # Regex pattern to match a URL
 HTTP_URL_PATTERN = r'^http[s]{0,1}://.+$'
 
+# Define OpenAI api_key
+# openai.api_key = '<Your API Key>'
+
 # Define root domain to crawl
 domain = "openai.com"
 full_url = "https://openai.com/"
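The first hunk only adds a commented-out placeholder for the API key. Outside the scope of this commit, a common alternative is to read the key from an environment variable so it never lands in source control; a minimal sketch, assuming the openai package is installed and an OPENAI_API_KEY variable is set:

import os
import openai

# Assumed setup, not part of this commit: pull the key from the
# environment instead of hardcoding it in the script.
openai.api_key = os.environ.get("OPENAI_API_KEY")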
@@ -136,22 +139,26 @@ def crawl(url):
         # Get the next URL from the queue
         url = queue.pop()
         print(url) # for debugging and to see the progress
 
-        # Save text from the url to a <url>.txt file
-        with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:
+        # Try extracting the text from the link, if failed proceed with the next item in the queue
+        try:
+            # Save text from the url to a <url>.txt file
+            with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:
 
-            # Get the text from the URL using BeautifulSoup
-            soup = BeautifulSoup(requests.get(url).text, "html.parser")
+                # Get the text from the URL using BeautifulSoup
+                soup = BeautifulSoup(requests.get(url).text, "html.parser")
 
-            # Get the text but remove the tags
-            text = soup.get_text()
+                # Get the text but remove the tags
+                text = soup.get_text()
 
-            # If the crawler gets to a page that requires JavaScript, it will stop the crawl
-            if ("You need to enable JavaScript to run this app." in text):
-                print("Unable to parse page " + url + " due to JavaScript being required")
+                # If the crawler gets to a page that requires JavaScript, it will stop the crawl
+                if ("You need to enable JavaScript to run this app." in text):
+                    print("Unable to parse page " + url + " due to JavaScript being required")
 
-            # Otherwise, write the text to the file in the text directory
-            f.write(text)
+                # Otherwise, write the text to the file in the text directory
+                f.write(text)
+        except Exception as e:
+            print("Unable to parse page " + url)
 
         # Get the hyperlinks from the URL and add them to the queue
         for link in get_domain_hyperlinks(local_domain, url):
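The second hunk wraps the whole extraction step in try/except, so a failure on one page (an invalid filename passed to open(), a requests error, a parse error) no longer aborts the crawl; the loop just reports the URL and pops the next one. A minimal, self-contained sketch of the resulting pattern follows; crawl_texts and the simplified queue handling are illustrative stand-ins, not the exact web-qa.py code, which also extracts and enqueues same-domain hyperlinks:

import os
from collections import deque
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

def crawl_texts(start_urls):
    # Save the visible text of each URL to text/<domain>/<url>.txt,
    # skipping pages that fail instead of aborting the whole crawl.
    queue = deque(start_urls)
    while queue:
        url = queue.pop()
        print(url)  # for debugging and to see the progress

        local_domain = urlparse(url).netloc
        os.makedirs('text/' + local_domain, exist_ok=True)

        # The fix in this commit: catch any failure here and
        # continue with the next item in the queue.
        try:
            # url[8:] strips "https://"; "/" becomes "_" to form a flat filename
            filename = 'text/' + local_domain + '/' + url[8:].replace("/", "_") + ".txt"
            with open(filename, "w", encoding="UTF-8") as f:
                soup = BeautifulSoup(requests.get(url).text, "html.parser")
                text = soup.get_text()

                # Pages that need JavaScript cannot be parsed from raw HTML
                if "You need to enable JavaScript to run this app." in text:
                    print("Unable to parse page " + url + " due to JavaScript being required")

                f.write(text)
        except Exception:
            print("Unable to parse page " + url)

crawl_texts(["https://openai.com/"])

Catching the broad Exception mirrors the commit; a tighter variant would catch OSError and requests.RequestException separately and let unexpected errors surface.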