commit 190b9875c17508609978bafe4a88911a93bfc042
parent c172c20e952abe9f34c2c78446b7bcdb81dbbd71
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 20 Jul 2020 07:56:52 -0400
[crawl] Start indexing errors
Diffstat:
4 files changed, 60 insertions(+), 4 deletions(-)
diff --git a/gus/constants.py b/gus/constants.py
@@ -11,3 +11,5 @@ DEFAULT_NON_ROOT_CHANGE_FREQUENCY = 24 * 7
DEFAULT_REDIRECT_CHANGE_FREQUENCY = 24 * 7
DEFAULT_BINARY_CHANGE_FREQUENCY = 24 * 30
DEFAULT_PROMPT_CHANGE_FREQUENCY = 24 * 30
+DEFAULT_TEMP_ERROR_CHANGE_FREQUENCY = 24
+DEFAULT_PERM_ERROR_CHANGE_FREQUENCY = 24 * 30 * 3
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -181,6 +181,25 @@ def index_redirect(resource):
return page
+def index_error(resource, is_temporary):
+ print("INDEXING ERROR...")
+ doc = {
+ "url": resource.indexable_url,
+ "fetchable_url": resource.fetchable_url,
+ "domain": resource.normalized_host,
+ "port": resource.urlsplit.port or 1965,
+ "change_frequency": constants.DEFAULT_TEMP_ERROR_CHANGE_FREQUENCY if is_temporary else constants.DEFAULT_PERM_ERROR_CHANGE_FREQUENCY,
+ }
+ existing_page = Page.get_or_none(url=resource.indexable_url)
+ if existing_page:
+ doc["id"] = existing_page.id
+ doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
+ page = Page(**doc)
+ page.save()
+ return page
+
+
+
def index_prompt(resource, response):
print("INDEXING PROMPT...")
doc = {
@@ -342,6 +361,32 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
# problem before getting a response
print("ERROR : %s" % gr.fetchable_url)
print("--------------------------")
+ elif response.status.startswith("4"):
+ # temporary error status
+ print("TEMP ERROR : %s" % response.url)
+ print("STATUS : %s" % response.status)
+ print("ERROR MSG : %s" % response.error_message)
+ print("--------------------------")
+ page = index_error(gr, True)
+ page_crawl = Crawl(page=page,
+ status=response.status,
+ is_different=False,
+ error_message=response.error_message,
+ timestamp=datetime.utcnow())
+ page_crawl.save()
+ elif response.status.startswith("5"):
+ # permanent error status
+ print("PERM ERROR : %s" % response.url)
+ print("STATUS : %s" % response.status)
+ print("ERROR MSG : %s" % response.error_message)
+ print("--------------------------")
+ page = index_error(gr, False)
+ page_crawl = Crawl(page=page,
+ status=response.status,
+ is_different=False,
+ error_message=response.error_message,
+ timestamp=datetime.utcnow())
+ page_crawl.save()
elif response.status.startswith("3"):
# redirect status
print("REDIRECT : %s -> %s" % (gr.fetchable_url, response.url))
@@ -398,7 +443,7 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
page_crawl.save()
print("--------------------------")
else:
- # input, error, etc (all other statuses)
+ # any other unhandled statuses
print("UNHANDLED : %s" % gr.fetchable_url)
print("--------------------------")
@@ -415,7 +460,16 @@ def unpickle_robot_file_map(index_dir):
def load_expired_urls():
- expired_pages = Page.raw("SELECT DISTINCT p.url FROM page as p JOIN crawl as c ON p.id == c.page_id WHERE datetime(c.timestamp, REPLACE('fnord hours', 'fnord', p.change_frequency)) < datetime('now');")
+ expired_pages = Page.raw("""SELECT url
+FROM (
+ SELECT p.url, p.change_frequency, MAX(c.timestamp) as timestamp
+ FROM page as p
+ JOIN crawl as c
+ ON p.id == c.page_id
+ GROUP BY p.url
+)
+WHERE datetime(timestamp, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now');
+""")
return [page.url for page in expired_pages.execute()]
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -54,5 +54,6 @@ class Crawl(Model):
page = ForeignKeyField(Page, backref="crawls", on_delete='CASCADE')
status = IntegerField()
+ error_message = TextField(null=True)
is_different = BooleanField()
timestamp = DateTimeField()
diff --git a/poetry.lock b/poetry.lock
@@ -108,10 +108,9 @@ python-versions = "*"
version = "0.1.0"
[package.source]
-reference = "47e40dcabd58ec0bf6347b1285d0a846af86f3aa"
+reference = "a0376fdbc00da564d3a19837a283a9c4d4d625d9"
type = "git"
url = "https://git.sr.ht/~natpen/gusmobile"
-
[[package]]
category = "dev"
description = "Read metadata from Python packages"