commit e1b3ac8ab4740ef176a3995551bcacae6244588d
parent 7ce66303c32c12552db2dedc7e0ab536f85e0915
Author: René Wagner <rwa@clttr.info>
Date: Tue, 13 Jul 2021 13:21:28 +0200
enable 'newest-hosts' and 'newest-pages' sites again
closes #26
Diffstat:
7 files changed, 50 insertions(+), 13 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -137,3 +137,15 @@ dmypy.json
# certificates
/*.crt
/*.key
+
+.bash_history
+.bashrc
+.config/
+.gitconfig
+.local/
+.poetry/
+.sqlite_history
+.ssh/
+.vim/
+.viminfo
+.vimrc
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -50,10 +50,13 @@ def index_binary(resource, response):
"last_crawl_success_at": datetime.utcnow(),
"last_status" : response.status,
"last_stats_message" : response.error_message,
+ "first_seen_at" : datetime.utcnow()
}
existing_page = Page.get_or_none(url=resource.normalized_url)
if existing_page:
doc["id"] = existing_page.id
+ if not (existing_page.first_seen_at is None):
+ doc["first_seen_at"] = existing_page.first_seen_at
existing_change_frequency = (
existing_page.change_frequency
or resource.get_default_change_frequency("binary")
@@ -61,6 +64,11 @@ def index_binary(resource, response):
doc["change_frequency"] = resource.increment_change_frequency(
existing_change_frequency, "binary"
)
+
+ if existing_page is None or existing_page.first_seen_at is None:
+ # New page, or no discovery time stored yet: stamp it now (key must match Page.first_seen_at).
+ doc["first_seen_at"] = datetime.utcnow()
+
page = Page(**doc)
try:
page.save()
@@ -86,10 +94,13 @@ def index_redirect(resource, response):
"last_crawl_success_at": datetime.utcnow(),
"last_status" : response.status,
"last_stats_message" : response.error_message,
+ "first_seen_at" : datetime.utcnow()
}
existing_page = Page.get_or_none(url=resource.normalized_url)
if existing_page:
doc["id"] = existing_page.id
+ if not (existing_page.first_seen_at is None):
+ doc["first_seen_at"] = existing_page.first_seen_at
existing_change_frequency = (
existing_page.change_frequency
or resource.get_default_change_frequency("redirect")
@@ -97,6 +108,11 @@ def index_redirect(resource, response):
doc["change_frequency"] = resource.increment_change_frequency(
existing_change_frequency, "redirect"
)
+
+ if existing_page is None or existing_page.first_seen_at is None:
+ # New page, or no discovery time stored yet: stamp it now (key must match Page.first_seen_at).
+ doc["first_seen_at"] = datetime.utcnow()
+
page = Page(**doc)
try:
page.save()
@@ -157,10 +173,13 @@ def index_prompt(resource, response):
"last_crawl_success_at": datetime.utcnow(),
"last_status" : response.status,
"last_stats_message" : response.error_message,
+ "first_seen_at" : datetime.utcnow()
}
existing_page = Page.get_or_none(url=resource.normalized_url)
if existing_page:
doc["id"] = existing_page.id
+ if not (existing_page.first_seen_at is None):
+ doc["first_seen_at"] = existing_page.first_seen_at
existing_change_frequency = (
existing_page.change_frequency
or resource.get_default_change_frequency("prompt")
@@ -168,6 +187,11 @@ def index_prompt(resource, response):
doc["change_frequency"] = resource.increment_change_frequency(
existing_change_frequency, "prompt"
)
+
+ if existing_page is None or existing_page.first_seen_at is None:
+ # New page, or no discovery time stored yet: stamp it now (key must match Page.first_seen_at).
+ doc["first_seen_at"] = datetime.utcnow()
+
page = Page(**doc)
try:
page.save()
@@ -197,6 +221,7 @@ def index_content(resource, response):
"last_crawl_success_at": datetime.utcnow(),
"last_status" : response.status,
"last_stats_message" : response.error_message,
+ "first_seen_at" : datetime.utcnow()
}
if response.content_type == "text/gemini":
doc["lang"] = (response.lang or "none",)
@@ -204,6 +229,8 @@ def index_content(resource, response):
is_different = False
if existing_page:
doc["id"] = existing_page.id
+ if not (existing_page.first_seen_at is None):
+ doc["first_seen_at"] = existing_page.first_seen_at
if existing_page.content:
is_different = doc["content"] != existing_page.content
if is_different:
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -46,6 +46,7 @@ class Page(Model):
last_crawl_success_at = DateTimeField(null=True)
last_status = IntegerField(null=True)
last_status_message = TextField(null=True)
+ first_seen_at = DateTimeField(null=True)
class Link(Model):
"""
diff --git a/serve/models.py b/serve/models.py
@@ -24,7 +24,7 @@ class GUS:
)
def search_index(self, query, requested_page):
- Search.create(query=query, timestamp=datetime.utcnow())
+ #Search.create(query=query, timestamp=datetime.utcnow())
query = self.index.parse_query(query)
results = self.index.search(query, requested_page, pagelen=10)
return (
@@ -186,12 +186,12 @@ AND p.last_crawl_success_at IS NOT NULL"""
def get_newest_hosts(self):
newest_hosts_query = Page.raw(
"""
- SELECT p.domain, MIN(c.timestamp) AS first_seen
+ SELECT p.domain, MIN(p.first_seen_at) AS first_seen_at -- earliest sighting per host
FROM page AS p
- INNER JOIN crawl AS c ON c.page_id = p.id
- WHERE c.status = 20
+ WHERE p.last_crawl_success_at IS NOT NULL
+ AND p.first_seen_at IS NOT NULL
GROUP BY p.domain
- ORDER BY first_seen DESC
+ ORDER BY first_seen_at DESC
LIMIT 10
"""
)
@@ -199,10 +199,9 @@ AND p.last_crawl_success_at IS NOT NULL"""
def get_newest_pages(self):
newest_pages_query = Page.raw(
- """SELECT p.url, p.fetchable_url AS first_seen
-FROM page as p
-GROUP BY p.url
-ORDER BY first_seen DESC
+ """SELECT p.url, p.fetchable_url, p.first_seen_at FROM page as p
+ WHERE last_crawl_success_at IS NOT NULL AND first_seen_at IS NOT NULL
+ORDER BY first_seen_at DESC
LIMIT 50
"""
)
diff --git a/serve/templates/index.gmi b/serve/templates/index.gmi
@@ -6,10 +6,8 @@
=> /statistics geminispace.info Statistics
=> /known-hosts Known Gemini Hosts
=> /known-feeds Known Gemini Feeds
-{% if (false) %}
=> /newest-hosts Newest Gemini Hosts
=> /newest-pages Newest Gemini Pages
-{% endif %}
## Help and Documentation
diff --git a/serve/templates/newest_hosts.gmi b/serve/templates/newest_hosts.gmi
@@ -6,7 +6,7 @@
Here are the ten most recently discovered Gemini hosts by geminispace.info.
{% for host in newest_hosts %}
-{{ "=> //{} {}: {}".format(host.domain, host.first_seen[:10], host.domain) }}
+{{ "=> //{} {}: {}".format(host.domain, host.first_seen_at.strftime('%Y-%m-%d'), host.domain) }}
{% endfor %}
{% include 'fragments/footer.gmi' %}
diff --git a/serve/templates/newest_pages.gmi b/serve/templates/newest_pages.gmi
@@ -6,7 +6,7 @@
Here are the fifty most recently discovered Gemini pages by geminispace.info.
{% for page in newest_pages %}
-{{ "=> {} {}: {}".format(page.fetchable_url, page.first_seen[:10], page.url) }}
+{{ "=> {} {}: {}".format(page.fetchable_url, page.first_seen_at.strftime('%Y-%m-%d'), page.url) }}
{% endfor %}
{% include 'fragments/footer.gmi' %}