geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 190b9875c17508609978bafe4a88911a93bfc042
parent c172c20e952abe9f34c2c78446b7bcdb81dbbd71
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Mon, 20 Jul 2020 07:56:52 -0400

[crawl] Start indexing errors

Diffstat:
M gus/constants.py    |  2 ++
M gus/crawl.py        | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M gus/lib/db_model.py |  1 +
M poetry.lock         |  3 +--
4 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/gus/constants.py b/gus/constants.py
@@ -11,3 +11,5 @@ DEFAULT_NON_ROOT_CHANGE_FREQUENCY = 24 * 7
 DEFAULT_REDIRECT_CHANGE_FREQUENCY = 24 * 7
 DEFAULT_BINARY_CHANGE_FREQUENCY = 24 * 30
 DEFAULT_PROMPT_CHANGE_FREQUENCY = 24 * 30
+DEFAULT_TEMP_ERROR_CHANGE_FREQUENCY = 24
+DEFAULT_PERM_ERROR_CHANGE_FREQUENCY = 24 * 30 * 3
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -181,6 +181,25 @@ def index_redirect(resource):
     return page
 
 
+def index_error(resource, is_temporary):
+    print("INDEXING ERROR...")
+    doc = {
+        "url": resource.indexable_url,
+        "fetchable_url": resource.fetchable_url,
+        "domain": resource.normalized_host,
+        "port": resource.urlsplit.port or 1965,
+        "change_frequency": constants.DEFAULT_TEMP_ERROR_CHANGE_FREQUENCY if is_temporary else constants.DEFAULT_PERM_ERROR_CHANGE_FREQUENCY,
+    }
+    existing_page = Page.get_or_none(url=resource.indexable_url)
+    if existing_page:
+        doc["id"] = existing_page.id
+        doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
+    page = Page(**doc)
+    page.save()
+    return page
+
+
+
 def index_prompt(resource, response):
     print("INDEXING PROMPT...")
     doc = {
@@ -342,6 +361,32 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
         # problem before getting a response
         print("ERROR : %s" % gr.fetchable_url)
         print("--------------------------")
+    elif response.status.startswith("4"):
+        # temporary error status
+        print("TEMP ERROR : %s" % response.url)
+        print("STATUS : %s" % response.status)
+        print("ERROR MSG : %s" % response.error_message)
+        print("--------------------------")
+        page = index_error(gr, True)
+        page_crawl = Crawl(page=page,
+                           status=response.status,
+                           is_different=False,
+                           error_message=response.error_message,
+                           timestamp=datetime.utcnow())
+        page_crawl.save()
+    elif response.status.startswith("5"):
+        # permanent error status
+        print("PERM ERROR : %s" % response.url)
+        print("STATUS : %s" % response.status)
+        print("ERROR MSG : %s" % response.error_message)
+        print("--------------------------")
+        page = index_error(gr, False)
+        page_crawl = Crawl(page=page,
+                           status=response.status,
+                           is_different=False,
+                           error_message=response.error_message,
+                           timestamp=datetime.utcnow())
+        page_crawl.save()
     elif response.status.startswith("3"):
         # redirect status
         print("REDIRECT : %s -> %s" % (gr.fetchable_url, response.url))
@@ -398,7 +443,7 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
         page_crawl.save()
         print("--------------------------")
     else:
-        # input, error, etc (all other statuses)
+        # any other unhandled statuses
         print("UNHANDLED : %s" % gr.fetchable_url)
         print("--------------------------")
 
@@ -415,7 +460,16 @@ def unpickle_robot_file_map(index_dir):
 
 
 def load_expired_urls():
-    expired_pages = Page.raw("SELECT DISTINCT p.url FROM page as p JOIN crawl as c ON p.id == c.page_id WHERE datetime(c.timestamp, REPLACE('fnord hours', 'fnord', p.change_frequency)) < datetime('now');")
+    expired_pages = Page.raw("""SELECT url
+FROM (
+    SELECT p.url, p.change_frequency, MAX(c.timestamp) as timestamp
+    FROM page as p
+    JOIN crawl as c
+    ON p.id == c.page_id
+    GROUP BY p.url
+)
+WHERE datetime(timestamp, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now');
+""")
     return [page.url for page in expired_pages.execute()]
 
 
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -54,5 +54,6 @@ class Crawl(Model):
 
     page = ForeignKeyField(Page, backref="crawls", on_delete='CASCADE')
     status = IntegerField()
+    error_message = TextField()
     is_different = BooleanField()
     timestamp = DateTimeField()
diff --git a/poetry.lock b/poetry.lock
@@ -108,10 +108,9 @@ python-versions = "*"
 version = "0.1.0"
 
 [package.source]
-reference = "47e40dcabd58ec0bf6347b1285d0a846af86f3aa"
+reference = "a0376fdbc00da564d3a19837a283a9c4d4d625d9"
 type = "git"
 url = "https://git.sr.ht/~natpen/gusmobile"
-
 [[package]]
 category = "dev"
 description = "Read metadata from Python packages"