commit 96b7b661fb2c5f5eed02e4475467ef61a1b4c251
parent 4cc4e66b6c01fe54d9e5992f7f6a2147d53eba2b
Author: Natalie Pendragon <natpen@natpen.net>
Date: Wed, 22 Jul 2020 13:29:00 -0400
Add normalized url to db
Diffstat:
5 files changed, 44 insertions(+), 38 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -116,16 +116,19 @@ def index_page(page, indexed_urls):
if page.fetchable_url in indexed_urls:
return
print(page.url)
- url = page.url.rstrip("/")
- from_page = Page.alias()
- to_page = Page.alias()
- backlinks = (Link
- .select(from_page)
- .join(from_page, on=(from_page.id == Link.from_page_id))
- .join(to_page, on=(to_page.id == Link.to_page_id))
- .where(to_page.url << [url, f"{url}/"])
- .dicts())
- backlink_urls = [b["url"] for b in backlinks]
+ u = page.url.rstrip("/")
+ backlinks = Page.raw("""SELECT p_from.url
+FROM page AS p_from
+JOIN indexable_crawl AS ic
+ON ic.page_id == p_from.id
+JOIN link as l
+ON l.from_page_id == p_from.id
+JOIN page as p_to
+ON p_to.id == l.to_page_id
+WHERE p_to.url IN (?, ?)
+GROUP BY p_from.normalized_url""", u, f"{u}/")
+
+ backlink_urls = [b.url for b in backlinks.execute()]
backlink_count = len(list(set(backlink_urls)))
document = {
@@ -182,15 +185,11 @@ def build_index(should_run_destructive=False, invalidation_window=0):
invalidate_recent_results(invalidation_window)
indexed_urls = [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT)
- pages = Page.raw("""SELECT p.*, MAX(c.timestamp) AS crawl_timestamp FROM (
- SELECT crawl.*, row_number()
- OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS mostRecently
- FROM crawl) AS c
+ pages = Page.raw("""SELECT p.*, MAX(c.timestamp) AS crawl_timestamp
+FROM indexable_crawl AS c
JOIN page AS p
ON p.id == c.page_id
-WHERE c.mostRecently < 3
-AND c.status == 20
-GROUP BY p.id""")
+GROUP BY p.normalized_url""")
for page in pages.execute():
index_page(page, indexed_urls)
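Both rewritten queries above lean on an indexable_crawl relation that this commit does not create. A minimal sketch of what that view could look like, assuming it simply packages the filter the removed inline subquery expressed (most recent successful crawls per page: row_number < 3, status 20); the view name comes from the queries above, everything else here is an assumption, not code from this repository:

    # Hypothetical sketch; the actual indexable_crawl definition is not in this commit.
    from peewee import SqliteDatabase

    db = SqliteDatabase("gus.sqlite")  # assumed database path
    db.execute_sql("""
    CREATE VIEW IF NOT EXISTS indexable_crawl AS
    SELECT c.* FROM (
        SELECT crawl.*, row_number()
        OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS most_recently
        FROM crawl) AS c
    WHERE c.most_recently < 3
    AND c.status == 20;
    """)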
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -147,6 +147,7 @@ def index_binary(resource, response):
doc = {
"url": resource.indexable_url,
"fetchable_url": resource.fetchable_url,
+ "normalized_url": resource.normalized_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
"content_type": response.content_type,
@@ -168,6 +169,7 @@ def index_redirect(resource):
doc = {
"url": resource.indexable_url,
"fetchable_url": resource.fetchable_url,
+ "normalized_url": resource.normalized_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
"change_frequency": constants.DEFAULT_REDIRECT_CHANGE_FREQUENCY,
@@ -186,6 +188,7 @@ def index_error(resource, is_temporary):
doc = {
"url": resource.indexable_url,
"fetchable_url": resource.fetchable_url,
+ "normalized_url": resource.normalized_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
"change_frequency": constants.DEFAULT_TEMP_ERROR_CHANGE_FREQUENCY if is_temporary else constants.DEFAULT_PERM_ERROR_CHANGE_FREQUENCY,
@@ -205,6 +208,7 @@ def index_prompt(resource, response):
doc = {
"url": resource.indexable_url,
"fetchable_url": resource.fetchable_url,
+ "normalized_url": resource.normalized_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
"content_type": "input",
@@ -228,6 +232,7 @@ def index_content(resource, response):
doc = {
"url": resource.indexable_url,
"fetchable_url": resource.fetchable_url,
+ "normalized_url": resource.normalized_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
"content_type": response.content_type,
@@ -462,14 +467,14 @@ def unpickle_robot_file_map(index_dir):
def load_expired_urls():
expired_pages = Page.raw("""SELECT url
FROM (
- SELECT p.url, p.change_frequency, MAX(c.timestamp) as timestamp
+ SELECT p.url, p.normalized_url, p.change_frequency, MAX(c.timestamp) as timestamp
FROM page as p
JOIN crawl as c
ON p.id == c.page_id
GROUP BY p.url
)
-WHERE datetime(timestamp, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now');
-""")
+WHERE datetime(timestamp, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now')
+GROUP BY normalized_url;""")
return [page.url for page in expired_pages.execute()]
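The indexing paths above all persist resource.normalized_url, a GeminiResource property whose definition is not part of this diff. A rough sketch of the kind of normalization the queries imply (lowercased host, default port 1965 dropped, trailing slash collapsed); the helper name and exact rules are assumptions, not the library's actual code:

    # Hypothetical helper; GeminiResource supplies the real normalized_url.
    from urllib.parse import urlsplit

    def normalize_gemini_url(url):
        parts = urlsplit(url)
        host = (parts.hostname or "").lower()
        # Drop the default Gemini port so host:1965 and host collapse to one key.
        netloc = host if parts.port in (None, 1965) else f"{host}:{parts.port}"
        # Trailing-slash-insensitive, matching the IN (?, ?) lookups above.
        path = parts.path.rstrip("/")
        return f"gemini://{netloc}{path}"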
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -28,6 +28,7 @@ class Page(Model):
url = TextField(unique=True, index=True)
fetchable_url = TextField(null=True)
+ normalized_url = TextField(null=True)
domain = TextField(null=True)
port = IntegerField(null=True)
content_type = TextField(null=True)
@@ -54,6 +55,6 @@ class Crawl(Model):
page = ForeignKeyField(Page, backref="crawls", on_delete='CASCADE')
status = IntegerField()
- error_message = TextField()
+ error_message = TextField(null=True)
is_different = BooleanField()
timestamp = DateTimeField()
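The new Page.normalized_url column (and the relaxed error_message field on Crawl) only take effect automatically for freshly created tables; an existing GUS database would need a one-off migration along these lines, which is not included in this commit (database path assumed):

    # Hypothetical migration; not part of this commit.
    from peewee import SqliteDatabase, TextField
    from playhouse.migrate import SqliteMigrator, migrate

    db = SqliteDatabase("gus.sqlite")  # assumed path
    migrator = SqliteMigrator(db)
    migrate(
        migrator.add_column("page", "normalized_url", TextField(null=True)),
        migrator.drop_not_null("crawl", "error_message"),
    )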
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -10,14 +10,12 @@ from gus.lib.db_model import Page, Link, Crawl
from gus.lib.gemini import GeminiResource
def compute_index_statistics(db):
- valid_page_ids_query = Page.raw("""SELECT DISTINCT p.id FROM (
- SELECT crawl.*, row_number()
- OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS mostRecently
- FROM crawl) AS c
+ valid_page_ids_query = Page.raw("""SELECT p.id
+FROM indexable_crawl AS c
JOIN page AS p
ON p.id == c.page_id
-WHERE c.mostRecently < 3
-AND c.status == 20""")
+GROUP BY p.normalized_url""")
+
valid_page_ids = [p.id for p in valid_page_ids_query.execute()]
page_count = len(valid_page_ids)
domains_query = (Page
diff --git a/serve/models.py b/serve/models.py
@@ -1,12 +1,12 @@
import re
from urllib.parse import quote
-from peewee import SqliteDatabase
+from peewee import fn, SqliteDatabase
from whoosh import highlight, qparser
from whoosh.index import open_dir
from . import constants
-from gus.lib.db_model import init_db, Page, Link
+from gus.lib.db_model import init_db, Page, Link, Crawl
from gus.lib.gemini import GeminiResource
from gus.lib.index_statistics import compute_index_statistics, load_all_statistics_from_file
from gus.lib.misc import bytes2human
@@ -67,15 +67,18 @@ class GUS():
return []
u = resource.indexable_url.rstrip("/")
- from_page = Page.alias()
- to_page = Page.alias()
- backlinks = (Link
- .select(from_page)
- .join(from_page, on=(from_page.id == Link.from_page_id))
- .join(to_page, on=(to_page.id == Link.to_page_id))
- .where(to_page.url << [u, f"{u}/"])
- .dicts())
- backlink_urls = [b["url"] for b in backlinks]
+ backlinks = Page.raw("""SELECT p_from.url
+FROM page AS p_from
+JOIN indexable_crawl AS ic
+ON ic.page_id == p_from.id
+JOIN link as l
+ON l.from_page_id == p_from.id
+JOIN page as p_to
+ON p_to.id == l.to_page_id
+WHERE p_to.url IN (?, ?)
+GROUP BY p_from.normalized_url""", u, f"{u}/")
+
+ backlink_urls = [b.url for b in backlinks.execute()]
return list(set(backlink_urls))