commit 96b7b661fb2c5f5eed02e4475467ef61a1b4c251
parent 4cc4e66b6c01fe54d9e5992f7f6a2147d53eba2b
Author: Natalie Pendragon <natpen@natpen.net>
Date: Wed, 22 Jul 2020 13:29:00 -0400
Add normalized url to db
Diffstat:
5 files changed, 44 insertions(+), 38 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -116,16 +116,19 @@ def index_page(page, indexed_urls):
if page.fetchable_url in indexed_urls:
return
print(page.url)
- url = page.url.rstrip("/")
- from_page = Page.alias()
- to_page = Page.alias()
- backlinks = (Link
- .select(from_page)
- .join(from_page, on=(from_page.id == Link.from_page_id))
- .join(to_page, on=(to_page.id == Link.to_page_id))
- .where(to_page.url << [url, f"{url}/"])
- .dicts())
- backlink_urls = [b["url"] for b in backlinks]
+ u = page.url.rstrip("/")
+ backlinks = Page.raw("""SELECT p_from.url
+FROM page AS p_from
+JOIN indexable_crawl AS ic
+ON ic.page_id == p_from.id
+JOIN link as l
+ON l.from_page_id == p_from.id
+JOIN page as p_to
+ON p_to.id == l.to_page_id
+WHERE p_to.url IN (?, ?)
+GROUP BY p_from.normalized_url""", u, f"{u}/")
+
+ backlink_urls = [b.url for b in backlinks.execute()]
backlink_count = len(list(set(backlink_urls)))
document = {
@@ -182,15 +185,11 @@ def build_index(should_run_destructive=False, invalidation_window=0):
invalidate_recent_results(invalidation_window)
indexed_urls = [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT)
- pages = Page.raw("""SELECT p.*, MAX(c.timestamp) AS crawl_timestamp FROM (
- SELECT crawl.*, row_number()
- OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS mostRecently
- FROM crawl) AS c
+ pages = Page.raw("""SELECT p.*, MAX(c.timestamp) AS crawl_timestamp
+FROM indexable_crawl AS c
JOIN page AS p
ON p.id == c.page_id
-WHERE c.mostRecently < 3
-AND c.status == 20
-GROUP BY p.id""")
+GROUP BY p.normalized_url""")
for page in pages.execute():
index_page(page, indexed_urls)
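Both rewritten queries above lean on an indexable_crawl relation that this commit does not create. A minimal sketch of what that view could look like, assuming it simply packages the filter the removed inline subquery expressed (most recent successful crawls per page: row_number < 3, status 20); the view name comes from the queries above, everything else here is an assumption, not code from this repository:

    # Hypothetical sketch; the actual indexable_crawl definition is not in this commit.
    from peewee import SqliteDatabase

    db = SqliteDatabase("gus.sqlite")  # assumed database path
    db.execute_sql("""
    CREATE VIEW IF NOT EXISTS indexable_crawl AS
    SELECT c.* FROM (
        SELECT crawl.*, row_number()
        OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS most_recently
        FROM crawl) AS c
    WHERE c.most_recently < 3
    AND c.status == 20;
    """)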
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -147,6 +147,7 @@ def index_binary(resource, response):
doc = {
"url": resource.indexable_url,
"fetchable_url": resource.fetchable_url,
+ "normalized_url": resource.normalized_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
"content_type": response.content_type,
@@ -168,6 +169,7 @@ def index_redirect(resource):
doc = {
"url": resource.indexable_url,
"fetchable_url": resource.fetchable_url,
+ "normalized_url": resource.normalized_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
"change_frequency": constants.DEFAULT_REDIRECT_CHANGE_FREQUENCY,
@@ -186,6 +188,7 @@ def index_error(resource, is_temporary):
doc = {
"url": resource.indexable_url,
"fetchable_url": resource.fetchable_url,
+ "normalized_url": resource.normalized_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
"change_frequency": constants.DEFAULT_TEMP_ERROR_CHANGE_FREQUENCY if is_temporary else constants.DEFAULT_PERM_ERROR_CHANGE_FREQUENCY,
@@ -205,6 +208,7 @@ def index_prompt(resource, response):
doc = {
"url": resource.indexable_url,
"fetchable_url": resource.fetchable_url,
+ "normalized_url": resource.normalized_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
"content_type": "input",
@@ -228,6 +232,7 @@ def index_content(resource, response):
doc = {
"url": resource.indexable_url,
"fetchable_url": resource.fetchable_url,
+ "normalized_url": resource.normalized_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
"content_type": response.content_type,
@@ -462,14 +467,14 @@ def unpickle_robot_file_map(index_dir):
def load_expired_urls():
expired_pages = Page.raw("""SELECT url
FROM (
- SELECT p.url, p.change_frequency, MAX(c.timestamp) as timestamp
+ SELECT p.url, p.normalized_url, p.change_frequency, MAX(c.timestamp) as timestamp
FROM page as p
JOIN crawl as c
ON p.id == c.page_id
GROUP BY p.url
)
-WHERE datetime(timestamp, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now');
-""")
+WHERE datetime(timestamp, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now')
+GROUP BY normalized_url;""")
return [page.url for page in expired_pages.execute()]
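The indexing paths above all persist resource.normalized_url, a GeminiResource property whose definition is not part of this diff. A rough sketch of the kind of normalization the queries imply (lowercased host, default port 1965 dropped, trailing slash collapsed); the helper name and exact rules are assumptions, not the library's actual code:

    # Hypothetical helper; GeminiResource supplies the real normalized_url.
    from urllib.parse import urlsplit

    def normalize_gemini_url(url):
        parts = urlsplit(url)
        host = (parts.hostname or "").lower()
        # Drop the default Gemini port so host:1965 and host collapse to one key.
        netloc = host if parts.port in (None, 1965) else f"{host}:{parts.port}"
        # Trailing-slash-insensitive, matching the IN (?, ?) lookups above.
        path = parts.path.rstrip("/")
        return f"gemini://{netloc}{path}"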
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -28,6 +28,7 @@ class Page(Model):
url = TextField(unique=True, index=True)
fetchable_url = TextField(null=True)
+ normalized_url = TextField(null=True)
domain = TextField(null=True)
port = IntegerField(null=True)
content_type = TextField(null=True)
@@ -54,6 +55,6 @@ class Crawl(Model):
page = ForeignKeyField(Page, backref="crawls", on_delete='CASCADE')
status = IntegerField()
- error_message = TextField()
+ error_message = TextField(null=True)
is_different = BooleanField()
timestamp = DateTimeField()
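The new Page.normalized_url column (and the relaxed error_message field on Crawl) only take effect automatically for freshly created tables; an existing GUS database would need a one-off migration along these lines, which is not included in this commit (database path assumed):

    # Hypothetical migration; not part of this commit.
    from peewee import SqliteDatabase, TextField
    from playhouse.migrate import SqliteMigrator, migrate

    db = SqliteDatabase("gus.sqlite")  # assumed path
    migrator = SqliteMigrator(db)
    migrate(
        migrator.add_column("page", "normalized_url", TextField(null=True)),
        migrator.drop_not_null("crawl", "error_message"),
    )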
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -10,14 +10,12 @@ from gus.lib.db_model import Page, Link, Crawl
from gus.lib.gemini import GeminiResource
def compute_index_statistics(db):
- valid_page_ids_query = Page.raw("""SELECT DISTINCT p.id FROM (
- SELECT crawl.*, row_number()
- OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS mostRecently
- FROM crawl) AS c
+ valid_page_ids_query = Page.raw("""SELECT p.id
+FROM indexable_crawl AS c
JOIN page AS p
ON p.id == c.page_id
-WHERE c.mostRecently < 3
-AND c.status == 20""")
+GROUP BY p.normalized_url""")
+
valid_page_ids = [p.id for p in valid_page_ids_query.execute()]
page_count = len(valid_page_ids)
domains_query = (Page
diff --git a/serve/models.py b/serve/models.py
@@ -1,12 +1,12 @@
import re
from urllib.parse import quote
-from peewee import SqliteDatabase
+from peewee import fn, SqliteDatabase
from whoosh import highlight, qparser
from whoosh.index import open_dir
from . import constants
-from gus.lib.db_model import init_db, Page, Link
+from gus.lib.db_model import init_db, Page, Link, Crawl
from gus.lib.gemini import GeminiResource
from gus.lib.index_statistics import compute_index_statistics, load_all_statistics_from_file
from gus.lib.misc import bytes2human
@@ -67,15 +67,18 @@ class GUS():
return []
u = resource.indexable_url.rstrip("/")
- from_page = Page.alias()
- to_page = Page.alias()
- backlinks = (Link
- .select(from_page)
- .join(from_page, on=(from_page.id == Link.from_page_id))
- .join(to_page, on=(to_page.id == Link.to_page_id))
- .where(to_page.url << [u, f"{u}/"])
- .dicts())
- backlink_urls = [b["url"] for b in backlinks]
+ backlinks = Page.raw("""SELECT p_from.url
+FROM page AS p_from
+JOIN indexable_crawl AS ic
+ON ic.page_id == p_from.id
+JOIN link as l
+ON l.from_page_id == p_from.id
+JOIN page as p_to
+ON p_to.id == l.to_page_id
+WHERE p_to.url IN (?, ?)
+GROUP BY p_from.normalized_url""", u, f"{u}/")
+
+ backlink_urls = [b.url for b in backlinks.execute()]
return list(set(backlink_urls))