geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 86bf28edff91ef55461ae8f27492dfadd8df8b22
parent 8a0c456fb9c75aa7c0930da9ecd67a53bc623749
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Tue, 11 Aug 2020 08:30:50 -0400

[crawl] Print change_frequency

Diffstat:
M gus/crawl.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -325,8 +325,6 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
         return
     if should_check_if_expired:
-        # expired_pages = Page.raw("SELECT p.* FROM page as p JOIN crawl as c ON p.id == c.page_id WHERE datetime(c.timestamp, REPLACE('fnord hours', 'fnord', p.change_frequency)) < datetime('now');")
-        # return [page.url for page in expired_pages.execute()]
         existing_page = Page.get_or_none(url=gr.indexable_url)
         if existing_page and existing_page.change_frequency is not None:
             most_recent_crawl = Crawl.select(fn.MAX(Crawl.timestamp)).where(Crawl.page == existing_page).scalar()
@@ -334,6 +332,8 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
                 print("TOO SOON : %s" % gr.fetchable_url)
                 print("--------------------------")
                 return
+            else:
+                print("Recrawling after %s hours" % existing_page.change_frequency)

     # ROBOTS
     robots_file = get_robots_file(gr.normalized_host)