geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit b5bf01a4454e1e904d09499038ef80ea8b5255e9
parent 9efd819e3e5317965a012192bc1ec2d4fd789515
Author: René Wagner <rwa@clttr.info>
Date:   Sun, 11 Jul 2021 09:05:01 +0200

remove Crawl table, all info is stored in page table now
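
For illustration (not part of this commit): with the Crawl table gone, a crawl outcome is written straight onto the Page row. A minimal sketch of that pattern using the new fields from gus/lib/db_model.py below; record_crawl_result is a hypothetical helper, not a function in the repository:

    from datetime import datetime

    def record_crawl_result(page, status, error_message=None):
        # page is a gus.lib.db_model.Page instance; the crawl outcome now
        # lives on the page row itself instead of a separate Crawl row
        page.last_crawl_at = datetime.utcnow()
        if status == 20:
            # 20 is the Gemini success status
            page.last_crawl_success_at = datetime.utcnow()
        page.last_status = status
        page.last_status_message = error_message
        page.save()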

Diffstat:
M gus/build_index.py          | 56 +++++++++++++++++++++++++++++++-------------------------
M gus/crawl.py                | 123 +++++++++++++++++++++----------------------------------------------------------
M gus/lib/db_model.py         | 29 +++++------------------------
M gus/lib/index_statistics.py | 26 +++++++++-----------------
M serve/templates/news.gmi    | 7 +++++++
5 files changed, 84 insertions(+), 157 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -24,19 +24,13 @@ uses_relative.append("gemini")
 uses_netloc.append("gemini")
 
-def index_page(index, page, indexed_urls):
+def index_page(index, page, should_run_desctructive):
     if should_skip(GeminiResource(page.url)):
         logging.debug(
             "URL is excluded, skipping: %s",
             strip_control_chars(page.url),
         )
         return False
-    if page.fetchable_url in indexed_urls:
-        logging.debug(
-            "Page already indexed, skipping: %s",
-            strip_control_chars(page.url),
-        )
-        return False
 
     logging.info("Indexing page: %s", strip_control_chars(page.url))
@@ -47,8 +41,7 @@ FROM page AS p_from
 JOIN link as l ON l.from_page_id == p_from.id
 JOIN page as p_to ON p_to.id == l.to_page_id
 WHERE p_to.url IN (?, ?)
-AND l.is_cross_host_like == 1
-GROUP BY p_from.normalized_url""",
+AND l.is_cross_host_like == 1""",
         u,
         f"{u}/",
     )
@@ -66,13 +59,20 @@ GROUP BY p_from.normalized_url""",
         "charset": page.charset or "none",
         "lang": page.lang,
         "size": page.size,
-        "indexed_at": datetime.fromisoformat(page.crawl_timestamp),
+        "indexed_at": datetime.utcnow(),
         "backlink_count": backlink_count,
         "prompt": page.prompt,
         "content": page.content,
     }
     try:
-        index.add_document(document)
+        if (page.indexed_at is None or should_run_destructive):
+            index.add_document(document)
+        else:
+            index.update_document(document)
+
+        page.indexed_at=datetime.utcnow()
+        page.save()
+
         return True
     except Exception as e:
         logging.exception(
@@ -94,32 +94,38 @@ def invalidate_recent_results(index, invalidation_window):
 
 def build_index(should_run_destructive=False, invalidation_window=0):
-    #index_dir = constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR
-    index_dir = constants.INDEX_DIR_NEW
+    index_dir = constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR
+# index_dir = constants.INDEX_DIR_NEW
     db = init_db(index_dir + "/gus.sqlite")
     index = search.Index(index_dir, should_run_destructive)
     invalidate_recent_results(index, invalidation_window)
-    indexed_urls = ([] if should_run_destructive else index.indexed_urls())
-
-    pages = Page.raw(
-        """SELECT p.*, MAX(c.timestamp) AS crawl_timestamp
-FROM indexable_crawl AS c
-JOIN page AS p
-ON p.id == c.page_id
-WHERE p.content_type NOT LIKE 'text/%'
-OR (p.content_type LIKE 'text/%' AND p.size <= ?)
-GROUP BY p.normalized_url""", constants.MAXIMUM_TEXT_PAGE_SIZE
+
+    if (should_run_destructive):
+        pages = Page.raw(
+            """SELECT p.* FROM page AS p
+WHERE p.last_status == 20
+AND (p.content_type NOT LIKE 'text/%'
+OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PAGE_SIZE
+        )
+    else:
+        pages = Page.raw(
+            """SELECT p.* FROM page AS p
+WHERE p.last_status == 20
+AND (p.indexed_at IS NULL OR
+p.indexed_at < p.last_crawl_success_at)
+AND (p.content_type NOT LIKE 'text/%'
+OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PAGE_SIZE
     )
 
     for page in pages.iterator():
-        index_page(index, page, indexed_urls)
+        index_page(index, page, should_run_destructive)
 
     try:
        index.close()
     except Exception as e:
-       logging.error('Closing of inde failed: %s', e);
+       logging.error('Closing of index failed: %s', e);
 
     index_statistics = compute_index_statistics(db)
     log_index_statistics(index_statistics)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -19,7 +19,7 @@ import peewee
 from gus.excludes import EXCLUDED_URL_PREFIXES, EXCLUDED_URL_PATHS
 from . import constants
-from gus.lib.db_model import init_db, Page, Link, Crawl
+from gus.lib.db_model import init_db, Page, Link
 from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
 
 import gus.lib.logging
@@ -54,6 +54,10 @@ def index_binary(resource, response):
         "charset": response.charset,
         "size": response.num_bytes,
         "change_frequency": resource.get_default_change_frequency("binary"),
+        "last_crawl_at": datetime.utcnow(),
+        "last_crawl_success_at": datetime.utcnow(),
+        "last_status" : response.status,
+        "last_stats_message" : response.error_message,
     }
     existing_page = Page.get_or_none(url=resource.indexable_url)
     if existing_page:
@@ -74,7 +78,7 @@ def index_binary(resource, response):
     return page
 
-def index_redirect(resource):
+def index_redirect(resource, response):
     logging.debug(
         "Indexing redirect for: %s",
         gus.lib.logging.strip_control_chars(resource.indexable_url),
     )
@@ -87,6 +91,10 @@ def index_redirect(resource):
         "domain": resource.normalized_host,
         "port": resource.urlsplit.port or 1965,
         "change_frequency": resource.get_default_change_frequency("redirect"),
+        "last_crawl_at": datetime.utcnow(),
+        "last_crawl_success_at": datetime.utcnow(),
+        "last_status" : response.status,
+        "last_stats_message" : response.error_message,
     }
     existing_page = Page.get_or_none(url=resource.indexable_url)
     if existing_page:
@@ -107,7 +115,7 @@ def index_redirect(resource):
     return page
 
-def index_error(resource, is_temporary):
+def index_error(resource, is_temporary, response):
     category = "temp_error" if is_temporary else "perm_error"
     default_change_frequency = resource.get_default_change_frequency(category)
     doc = {
@@ -117,6 +125,10 @@ def index_error(resource, is_temporary):
         "domain": resource.normalized_host,
         "port": resource.urlsplit.port or 1965,
         "change_frequency": default_change_frequency,
+        "last_crawl_at": datetime.utcnow(),
+        "last_crawl_success_at": datetime.utcnow(),
+        "last_status" : None,
+        "last_status_message" : None,
     }
     existing_page = Page.get_or_none(url=resource.indexable_url)
     if existing_page:
@@ -153,6 +165,10 @@ def index_prompt(resource, response):
         "size": response.num_bytes,
         "prompt": response.prompt,
         "change_frequency": resource.get_default_change_frequency("prompt"),
+        "last_crawl_at": datetime.utcnow(),
+        "last_crawl_success_at": datetime.utcnow(),
+        "last_status" : response.status,
+        "last_stats_message" : response.error_message,
     }
     existing_page = Page.get_or_none(url=resource.indexable_url)
     if existing_page:
@@ -190,6 +206,10 @@ def index_content(resource, response):
         "content": response.content if response.num_bytes <= constants.MAXIMUM_TEXT_PAGE_SIZE else None,
         "size": response.num_bytes,
         "change_frequency": resource.get_default_change_frequency("content"),
+        "last_crawl_at": datetime.utcnow(),
+        "last_crawl_success_at": datetime.utcnow(),
+        "last_status" : response.status,
+        "last_stats_message" : response.error_message,
     }
     if response.content_type == "text/gemini":
         doc["lang"] = (response.lang or "none",)
@@ -313,11 +333,7 @@ def crawl_page(
     if should_check_if_expired:
         existing_page = Page.get_or_none(url=gr.indexable_url)
         if existing_page and existing_page.change_frequency is not None:
-            most_recent_crawl = (
-                Crawl.select(peewee.fn.MAX(Crawl.timestamp))
-                .where(Crawl.page == existing_page)
-                .scalar()
-            )
+            most_recent_crawl = existing_page.last_crawl_at
             if most_recent_crawl and datetime.now() < most_recent_crawl + timedelta(
                 hours=existing_page.change_frequency
             ):
@@ -347,7 +363,7 @@
         )
         return
 
-    # Crawl delay
+    # crawl delay
     if gr.normalized_host in domain_hit_timings:
         if gr.normalized_host in CRAWL_DELAYS:
             next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(
@@ -377,14 +393,7 @@
     if response is None:
         # problem before getting a response
         logging.warn("Failed to fetch: %s", gus.lib.logging.strip_control_chars(url))
-        page = index_error(gr, True)
-        page_crawl = Crawl(
-            page=page, status=0, is_different=False, timestamp=datetime.utcnow()
-        )
-        try:
-            page_crawl.save()
-        except:
-            logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
+        page = index_error(gr, True, response)
         failure_count[gr.normalized_host] = failure_count[gr.normalized_host] + 1 if gr.normalized_host in failure_count else 1
         logging.debug("Failed request count for host %s is %d", gr.normalized_host, failure_count[gr.normalized_host])
@@ -399,18 +408,7 @@
             response.status,
             response.error_message,
         )
-        page = index_error(gr, True)
-        page_crawl = Crawl(
-            page=page,
-            status=response.status,
-            is_different=False,
-            error_message=response.error_message,
-            timestamp=datetime.utcnow(),
-        )
-        try:
-            page_crawl.save()
-        except:
-            logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
+        page = index_error(gr, True, response)
     elif response.status.startswith("5"):
         # permanent error status
@@ -420,18 +418,7 @@
             response.status,
             response.error_message,
         )
-        page = index_error(gr, False)
-        page_crawl = Crawl(
-            page=page,
-            status=response.status,
-            is_different=False,
-            error_message=response.error_message,
-            timestamp=datetime.utcnow(),
-        )
-        try:
-            page_crawl.save()
-        except:
-            logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
+        page = index_error(gr, False, response)
     elif response.status.startswith("3"):
         # redirect status
@@ -456,17 +443,7 @@
                 gus.lib.logging.strip_control_chars(url),
             )
             return
-        page = index_redirect(gr)
-        page_crawl = Crawl(
-            page=page,
-            status=response.status,
-            is_different=False,
-            timestamp=datetime.utcnow(),
-        )
-        try:
-            page_crawl.save()
-        except:
-            logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
+        page = index_redirect(gr, response)
         index_links(gr, [redirect_resource])
         crawl_page(
             redirect_resource,
@@ -484,16 +461,6 @@
             response.prompt,
         )
        page = index_prompt(gr, response)
-        page_crawl = Crawl(
-            page=page,
-            status=response.status,
-            is_different=False,
-            timestamp=datetime.utcnow(),
-        )
-        try:
-            page_crawl.save()
-        except:
-            logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
     elif response.status.startswith("2"):
         # success status
         logging.debug(
         )
         if response.content_type.startswith("text/"):
             page, is_different = index_content(gr, response)
-            page_crawl = Crawl(
-                page=page,
-                status=response.status,
-                is_different=is_different,
-                timestamp=datetime.utcnow(),
-            )
-            try:
-                page_crawl.save()
-            except:
-                logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
             if response.content_type != "text/gemini":
                 logging.debug(
                     "Content is not gemini text: %s: %s",
                 )
         else:
             page = index_binary(gr, response)
-            page_crawl = Crawl(
-                page=page,
-                status=response.status,
-                is_different=False,
-                timestamp=datetime.utcnow(),
-            )
-            try:
-                page_crawl.save()
-            except:
-                logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
     else:
         logging.warn(
             "Got unhandled status: %s: %s",
@@ -553,15 +500,9 @@ def crawl_page(
 
 def load_expired_urls():
     expired_pages = Page.raw(
         """SELECT url
-FROM (
-    SELECT p.url, p.normalized_url, p.change_frequency, MAX(c.timestamp) as timestamp FROM page as p
-    JOIN crawl as c
-    ON p.id == c.page_id
-    GROUP BY p.url
-)
-WHERE datetime(timestamp, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now')
-GROUP BY normalized_url;"""
+WHERE datetime(last_crawl_at, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now')
+"""
     )
     return [page.url for page in expired_pages.execute()]
@@ -668,7 +609,7 @@ def recrawl_feeds():
 
 def run_crawl(should_run_destructive=False, seed_urls=[]):
     global index_dir
-    index_dir = constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR
+    index_dir = constants.INDEX_DIR if should_run_destructive else constants.INDEX_DIR
     pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
     global db
     db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -17,19 +17,10 @@ def init_db(filename=":memory:"):
     """
     Bind an SQLite database to the Peewee ORM models.
     """
-    models = [Crawl, Link, Page, Search, Thread, ThreadPage]
+    models = [Link, Page, Search, Thread, ThreadPage]
     db = SqliteDatabase(filename)
     db.bind(models)
     db.create_tables(models)
-    db.execute_sql(
-        """CREATE VIEW IF NOT EXISTS indexable_crawl AS
-SELECT c.* FROM (
-    SELECT crawl.*, row_number()
-    OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS rank
-    FROM crawl) AS c
-WHERE c.rank < 3
-AND c.status == 20;"""
-    )
     return db
@@ -52,7 +43,10 @@
     size = IntegerField(null=True) # in bytes
     change_frequency = IntegerField(null=True) # in hours
     indexed_at = DateTimeField(null=True)
-
+    last_crawl_at = DateTimeField(null=True)
+    last_crawl_success_at = DateTimeField(null=True)
+    last_status = IntegerField(null=True)
+    last_status_message = TextField(null=True)
 
 class Link(Model):
     """
@@ -66,19 +60,6 @@
     def get_is_cross_host_like(from_resource, to_resource):
         return from_resource.normalized_host_like != to_resource.normalized_host_like
 
-
-class Crawl(Model):
-    """
-    Attempts to crawl a page.
-    """
-
-    page = ForeignKeyField(Page, backref="crawls", on_delete="CASCADE")
-    status = IntegerField()
-    error_message = TextField(null=True)
-    is_different = BooleanField()
-    timestamp = DateTimeField()
-
-
 class Search(Model):
     """
     A log of performed searches
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -4,21 +4,16 @@ from datetime import datetime
 from peewee import fn, SQL
 
 from gus.excludes import EXCLUDED_URL_PREFIXES
-from gus.lib.db_model import Page, Crawl
+from gus.lib.db_model import Page
 
 
 def compute_index_statistics(db):
-    page_count = len(Page.raw("""SELECT p.id
-FROM indexable_crawl AS c
-JOIN page AS p
-ON p.id == c.page_id
+    page_count = len(Page.raw("""SELECT DISTINCT p.id
+FROM page AS p
 GROUP BY p.normalized_url""").dicts())
 
     domains_query = Page.raw("""SELECT DISTINCT p.domain, p.port
-FROM indexable_crawl AS c
-JOIN page AS p
-ON p.id == c.page_id
-""")
+FROM page AS p""")
     domains = []
     for d in domains_query.execute():
         s = d.domain
@@ -35,19 +30,16 @@ ON p.id == c.page_id
     domain_count = len(domains)
     content_type_frequencies = (Page.raw("""SELECT p.content_type, count(p.content_type) as 'count'
-FROM indexable_crawl AS c
-JOIN page AS p
-ON p.id == c.page_id
-GROUP BY p.content_type
+FROM page AS p
+GROUP BY p.normalized_url, p.content_type
 ORDER BY 2 desc""").dicts())
 
     charset_frequencies = (Page.raw("""SELECT upper(p.charset), count(p.id) as 'count'
-FROM indexable_crawl AS c
-JOIN page AS p
-ON p.id == c.page_id
+FROM page AS p
 WHERE p.charset IS NOT NULL
 GROUP BY upper(p.charset)
 ORDER BY 2 desc""").dicts())
 
-    index_modification_time = Crawl.select(fn.MAX(Crawl.timestamp)).scalar()
+    #index_modification_time = datetime.fromisoformat(Page.select(fn.Max(last_crawl_at)).scalar())
+    index_modification_time = datetime.utcnow()
     return {
         "index_modification_time": index_modification_time,
diff --git a/serve/templates/news.gmi b/serve/templates/news.gmi
@@ -1,6 +1,13 @@
 {% include 'fragments/header.gmi' %}
+
 ## News
+### 2021-07-10
+I finally managed to analyze the indexing process. In the end it turned out to be an issue when calculating the backlink counters and with an adapted query indexing is fast again.
+Obviously I was horribly wrong all the time blaming the slow VPS.
+
+Unfortunately this is only a small step in the major overhaul of GUS.
+
 ### 2021-07-04
 More trouble along the way. Although the VPS hosting geminispace.info runs with 8 Gigs of RAM and does not serve other services, the index update got oom-killed. :(
 Seems due to the continued growth of gemini we are hitting the same problems Natalie hit a few months ago on GUS. I'm currently unsure about the next steps.
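
For illustration (not part of this commit): the gus/build_index.py changes above boil down to selecting pages whose last successful crawl is newer than indexed_at, then either adding or updating the search document. A condensed, hypothetical sketch of that decision; the real document dict built in index_page() carries many more fields:

    from datetime import datetime

    def index_one(index, page, should_run_destructive=False):
        # abridged document; see index_page() in gus/build_index.py for the full dict
        document = {"size": page.size, "indexed_at": datetime.utcnow()}
        if page.indexed_at is None or should_run_destructive:
            index.add_document(document)     # page not in the index yet, or full rebuild
        else:
            index.update_document(document)  # refresh the existing entry
        page.indexed_at = datetime.utcnow()  # remember when the page was last indexed
        page.save()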