geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit e1b3ac8ab4740ef176a3995551bcacae6244588d
parent 7ce66303c32c12552db2dedc7e0ab536f85e0915
Author: René Wagner <rwa@clttr.info>
Date:   Tue, 13 Jul 2021 13:21:28 +0200

enable 'newest-hosts' and 'newest-pages' sites again

closes #26

Diffstat:
M .gitignore | 12 ++++++++++++
M gus/crawl.py | 27 +++++++++++++++++++++++++++
M gus/lib/db_model.py | 1 +
M serve/models.py | 17 ++++++++---------
M serve/templates/index.gmi | 2 --
M serve/templates/newest_hosts.gmi | 2 +-
M serve/templates/newest_pages.gmi | 2 +-
7 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -137,3 +137,15 @@ dmypy.json # certificates /*.crt /*.key + +.bash_history +.bashrc +.config/ +.gitconfig +.local/ +.poetry/ +.sqlite_history +.ssh/ +.vim/ +.viminfo +.vimrc diff --git a/gus/crawl.py b/gus/crawl.py @@ -50,10 +50,13 @@ def index_binary(resource, response): "last_crawl_success_at": datetime.utcnow(), "last_status" : response.status, "last_stats_message" : response.error_message, + "first_seen_at" : datetime.utcnow() } existing_page = Page.get_or_none(url=resource.normalized_url) if existing_page: doc["id"] = existing_page.id + if not (existing_page.first_seen_at is None): + doc["first_seen_at"] = existing_page.first_seen_at existing_change_frequency = ( existing_page.change_frequency or resource.get_default_change_frequency("binary") @@ -61,6 +64,11 @@ def index_binary(resource, response): doc["change_frequency"] = resource.increment_change_frequency( existing_change_frequency, "binary" ) + + if ((existing_page is not None and existing_page.first_seen_at is None) or + existing_page is None): + doc["fist_seen_at"] = datetime.utcnow() + page = Page(**doc) try: page.save() @@ -86,10 +94,13 @@ def index_redirect(resource, response): "last_crawl_success_at": datetime.utcnow(), "last_status" : response.status, "last_stats_message" : response.error_message, + "first_seen_at" : datetime.utcnow() } existing_page = Page.get_or_none(url=resource.normalized_url) if existing_page: doc["id"] = existing_page.id + if not (existing_page.first_seen_at is None): + doc["first_seen_at"] = existing_page.first_seen_at existing_change_frequency = ( existing_page.change_frequency or resource.get_default_change_frequency("redirect") @@ -97,6 +108,11 @@ def index_redirect(resource, response): doc["change_frequency"] = resource.increment_change_frequency( existing_change_frequency, "redirect" ) + + if ((existing_page is not None and existing_page.first_seen_at is None) or + existing_page is None): + doc["fist_seen_at"] = datetime.utcnow() 
+ page = Page(**doc) try: page.save() @@ -157,10 +173,13 @@ def index_prompt(resource, response): "last_crawl_success_at": datetime.utcnow(), "last_status" : response.status, "last_stats_message" : response.error_message, + "first_seen_at" : datetime.utcnow() } existing_page = Page.get_or_none(url=resource.normalized_url) if existing_page: doc["id"] = existing_page.id + if not (existing_page.first_seen_at is None): + doc["first_seen_at"] = existing_page.first_seen_at existing_change_frequency = ( existing_page.change_frequency or resource.get_default_change_frequency("prompt") @@ -168,6 +187,11 @@ def index_prompt(resource, response): doc["change_frequency"] = resource.increment_change_frequency( existing_change_frequency, "prompt" ) + + if ((existing_page is not None and existing_page.first_seen_at is None) or + existing_page is None): + doc["fist_seen_at"] = datetime.utcnow() + page = Page(**doc) try: page.save() @@ -197,6 +221,7 @@ def index_content(resource, response): "last_crawl_success_at": datetime.utcnow(), "last_status" : response.status, "last_stats_message" : response.error_message, + "first_seen_at" : datetime.utcnow() } if response.content_type == "text/gemini": doc["lang"] = (response.lang or "none",) @@ -204,6 +229,8 @@ def index_content(resource, response): is_different = False if existing_page: doc["id"] = existing_page.id + if not (existing_page.first_seen_at is None): + doc["first_seen_at"] = existing_page.first_seen_at if existing_page.content: is_different = doc["content"] != existing_page.content if is_different: diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py @@ -46,6 +46,7 @@ class Page(Model): last_crawl_success_at = DateTimeField(null=True) last_status = IntegerField(null=True) last_status_message = TextField(null=True) + first_seen_at = DateTimeField(null=True) class Link(Model): """ diff --git a/serve/models.py b/serve/models.py @@ -24,7 +24,7 @@ class GUS: ) def search_index(self, query, requested_page): - 
Search.create(query=query, timestamp=datetime.utcnow()) + #Search.create(query=query, timestamp=datetime.utcnow()) query = self.index.parse_query(query) results = self.index.search(query, requested_page, pagelen=10) return ( @@ -186,12 +186,12 @@ AND p.last_crawl_success_at IS NOT NULL""" def get_newest_hosts(self): newest_hosts_query = Page.raw( """ - SELECT p.domain, MIN(c.timestamp) AS first_seen + SELECT p.domain, p.first_seen_at FROM page AS p - INNER JOIN crawl AS c ON c.page_id = p.id - WHERE c.status = 20 + WHERE last_crawl_success_at IS NOT NULL + AND first_seen_at IS NOT NULL GROUP BY p.domain - ORDER BY first_seen DESC + ORDER BY first_seen_at DESC LIMIT 10 """ ) @@ -199,10 +199,9 @@ AND p.last_crawl_success_at IS NOT NULL""" def get_newest_pages(self): newest_pages_query = Page.raw( - """SELECT p.url, p.fetchable_url AS first_seen -FROM page as p -GROUP BY p.url -ORDER BY first_seen DESC + """SELECT p.url, p.fetchable_url, p.first_seen_at FROM page as p + WHERE last_crawl_success_at IS NOT NULL AND first_seen_at IS NOT NULL +ORDER BY first_seen_at DESC LIMIT 50 """ ) diff --git a/serve/templates/index.gmi b/serve/templates/index.gmi @@ -6,10 +6,8 @@ => /statistics geminispace.info Statistics => /known-hosts Known Gemini Hosts => /known-feeds Known Gemini Feeds -{% if (false) %} => /newest-hosts Newest Gemini Hosts => /newest-pages Newest Gemini Pages -{% endif %} ## Help and Documentation diff --git a/serve/templates/newest_hosts.gmi b/serve/templates/newest_hosts.gmi @@ -6,7 +6,7 @@ Here are the ten most recently discovered Gemini hosts by geminispace.info. 
{% for host in newest_hosts %} -{{ "=> //{} {}: {}".format(host.domain, host.first_seen[:10], host.domain) }} +{{ "=> //{} {}: {}".format(host.domain, host.first_seen_at.strftime('%Y-%m-%d'), host.domain) }} {% endfor %} {% include 'fragments/footer.gmi' %} diff --git a/serve/templates/newest_pages.gmi b/serve/templates/newest_pages.gmi @@ -6,7 +6,7 @@ Here are the fifty most recently discovered Gemini pages by geminispace.info. {% for page in newest_pages %} -{{ "=> {} {}: {}".format(page.fetchable_url, page.first_seen[:10], page.url) }} +{{ "=> {} {}: {}".format(page.fetchable_url, page.first_seen_at.strftime('%Y-%m-%d'), page.url) }} {% endfor %} {% include 'fragments/footer.gmi' %}