geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit fa2db540f6a05e5be6423a0e6c4d24950f8fbdf8
parent b484a4dadc7b58eac2f13188a748f64a83d58a26
Author: René Wagner <rwa@clttr.info>
Date:   Thu, 14 Oct 2021 18:54:12 +0200

more meta data for index cleanup

Diffstat:
M gus/build_index.py          | 33 ++++++---------------------------
M gus/constants.py            |  2 +-
M gus/crawl.py                | 18 +++++++++++-------
M gus/excludes.py             |  2 +-
M gus/lib/db_model.py         |  1 +
M gus/lib/index_statistics.py |  8 ++++----
6 files changed, 24 insertions(+), 40 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -79,36 +79,23 @@ AND l.is_cross_host_like == 1""",
     return False
 
 
-def invalidate_recent_results(index, invalidation_window):
-    recency_minimum = datetime.now() - timedelta(hours=invalidation_window)
-    pages = Page.select().where(
-        Page.indexed_at.is_null(False), Page.indexed_at > recency_minimum
-    )
-    logging.debug('Invalidating %d pages %s', pages.count(), recency_minimum)
-    for page in pages:
-        index.delete_by_term("url_id", page.url)
-
-
-def build_index(should_run_destructive=False, invalidation_window=0):
+def build_index(should_run_destructive=False):
     index_dir = constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR
-#    index_dir = constants.INDEX_DIR_NEW
 
     db = init_db(index_dir + "/gus.sqlite")
     index = search.Index(index_dir, should_run_destructive)
-    invalidate_recent_results(index, invalidation_window)
-
     if (should_run_destructive):
         pages = Page.raw(
             """SELECT p.* FROM page AS p
-WHERE p.last_status == 20
+WHERE p.last_success_status == 20
 AND (p.content_type NOT LIKE 'text/%'
 OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PAGE_SIZE
         )
     else:
         pages = Page.raw(
             """SELECT p.* FROM page AS p
-WHERE p.last_status == 20
+WHERE p.last_success_status == 20
 AND (p.indexed_at IS NULL
 OR p.indexed_at < p.last_crawl_success_at)
 AND (p.content_type NOT LIKE 'text/%'
@@ -126,14 +113,14 @@ OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PA
     if (should_run_destructive):
         pages = Page.raw(
             """SELECT p.* FROM page AS p
-WHERE p.last_status == 20
+WHERE p.last_success_status == 20
 AND (p.content_type NOT LIKE 'text/%'
 OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PAGE_SIZE
         )
     else:
         pages = Page.raw(
             """SELECT p.* FROM page AS p
-WHERE p.last_status == 20
+WHERE p.last_success_status == 20
 AND (p.indexed_at IS NULL
 OR p.indexed_at < p.last_crawl_success_at)
 AND (p.content_type NOT LIKE 'text/%'
@@ -159,7 +146,7 @@ OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PA
 def main():
     args = parse_args()
     gus.lib.logging.handle_arguments(args)
-    build_index(args.should_run_destructive, args.invalidation_window)
+    build_index(args.should_run_destructive)
 
 
 def parse_args():
@@ -172,14 +159,6 @@ def parse_args():
         default=False,
         help="create a fresh index",
     )
-    parser.add_argument(
-        "--invalidation_window",
-        "-i",
-        dest="invalidation_window",
-        type=int,
-        default=0,
-        help="a recency window, in hours, for recently crawled pages that should be forcefully reindexed",
-    )
     gus.lib.logging.add_arguments(parser)
     args = parser.parse_args()
     return args
diff --git a/gus/constants.py b/gus/constants.py
@@ -6,7 +6,7 @@ DB_FILENAME = "gus.sqlite"
 FEED_FILE = "feeds.txt"
 MAXIMUM_REDIRECT_CHAIN_LENGTH = 5
 MAXIMUM_FAILED_REQUEST_COUNT = 5
-MAXIMUM_TEXT_PAGE_SIZE = 5120000 # 1000KB, in bytes
+MAXIMUM_TEXT_PAGE_SIZE = 5120000 # in bytes
 
 # default change frequencies (in hours)
 ROOT_CHANGE_FREQUENCY_DEFAULT = 24
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -49,7 +49,8 @@ def index_binary(resource, response):
         "last_crawl_at": datetime.utcnow(),
         "last_crawl_success_at": datetime.utcnow(),
         "last_status" : response.status,
-        "last_stats_message" : response.error_message,
+        "last_success_status": response.status,
+        "last_status_message" : response.error_message,
         "first_seen_at" : datetime.utcnow()
     }
     existing_page = Page.get_or_none(url=resource.normalized_url)
@@ -89,7 +90,8 @@ def index_redirect(resource, response):
         "last_crawl_at": datetime.utcnow(),
         "last_crawl_success_at": datetime.utcnow(),
         "last_status" : response.status,
-        "last_stats_message" : response.error_message,
+        "last_success_status" : response.status,
+        "last_status_message" : response.error_message,
         "first_seen_at" : datetime.utcnow()
     }
     existing_page = Page.get_or_none(url=resource.normalized_url)
@@ -164,7 +166,8 @@ def index_prompt(resource, response):
         "last_crawl_at": datetime.utcnow(),
         "last_crawl_success_at": datetime.utcnow(),
         "last_status" : response.status,
-        "last_stats_message" : response.error_message,
+        "last_success_status" : response.status,
+        "last_status_message" : response.error_message,
         "first_seen_at" : datetime.utcnow()
     }
     existing_page = Page.get_or_none(url=resource.normalized_url)
@@ -208,7 +211,8 @@ def index_content(resource, response):
         "last_crawl_at": datetime.utcnow(),
         "last_crawl_success_at": datetime.utcnow(),
         "last_status" : response.status,
-        "last_stats_message" : response.error_message,
+        "last_success_status" : response.status,
+        "last_status_message" : response.error_message,
         "first_seen_at" : datetime.utcnow()
     }
     if response.content_type == "text/gemini":
@@ -257,10 +261,10 @@ def should_skip(resource):
             if m:
                 should_skip = True
     except:
-        logging.error("Error checking for exclude of url: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
-        should_skip = True
+        logging.error("Error checking for exclude of url: %s", gus.lib.logging.strip_control_chars(resource.raw_url))
+        should_skip = True
 
-    return should_skip
+    return should_skip
 
 
 def index_links(from_resource, contained_resources):
diff --git a/gus/excludes.py b/gus/excludes.py
@@ -2,7 +2,6 @@
 # prepended with the gemini:// protocol, be all lowercased, and
 # not have the port specified if it is 1965.
 EXCLUDED_URL_PREFIXES = [
-    # test and other invalid URIs
     "gemini://localhost",
     "gemini://example.org",
     "gemini://example.com",
@@ -136,6 +135,7 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://gemini.susa.net/cgi-bin/search?",
     "gemini://gemini.susa.net/cgi-bin/twitter?",
     "gemini://gemini.susa.net/cgi-bin/vim-search?",
+    "gemini://gemini.susa.net/cgi-bin/links_stu.lua?",
 
     "gemini://gemini.spam.works/textfiles/",
     "gemini://gemini.spam.works/mirrors/textfiles/",
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -46,6 +46,7 @@ class Page(Model):
     last_crawl_success_at = DateTimeField(null=True)
     last_status = IntegerField(null=True)
     last_status_message = TextField(null=True)
+    last_success_status = IntegerField(null=True)
     first_seen_at = DateTimeField(null=True)
 
 class Link(Model):
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -10,11 +10,11 @@ from gus.lib.db_model import Page
 def compute_index_statistics(db):
     page_count = len(Page.raw("""SELECT DISTINCT p.id
 FROM page AS p
-WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL""").dicts())
+WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL""").dicts())
 
     domains_query = Page.raw("""SELECT DISTINCT p.domain, p.port
 FROM page AS p
-WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL""")
+WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL""")
     domains = []
     for d in domains_query.execute():
         s = d.domain
@@ -32,12 +32,12 @@ WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL""")
 
     content_type_frequencies = (Page.raw("""SELECT p.content_type, count(p.content_type) as 'count'
 FROM page AS p
-WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL
+WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL
 GROUP BY p.content_type
 ORDER BY 2 desc""").dicts())
     charset_frequencies = (Page.raw("""SELECT upper(p.charset), count(p.id) as 'count'
 FROM page AS p
-WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL AND p.charset IS NOT NULL
+WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL AND p.charset IS NOT NULL
 GROUP BY upper(p.charset)
 ORDER BY 2 desc""").dicts())
     index_modification_time = Page.select(fn.Max(Page.last_crawl_at)).scalar()
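
Note: the commit adds the last_success_status column to the peewee Page model and starts filtering on it, but no database migration is included in the diff, so an existing gus.sqlite presumably needs the column added and backfilled by hand before build_index.py will find any pages. A minimal one-off sketch, assuming plain sqlite3 access, a hypothetical database path, and that backfilling from last_status is acceptable:

# one-off migration sketch -- an assumption, not part of this commit
import sqlite3

DB_PATH = "index/gus.sqlite"  # hypothetical; the crawler opens index_dir + "/gus.sqlite"

con = sqlite3.connect(DB_PATH)
with con:
    # add the column that build_index.py and index_statistics.py now filter on
    con.execute("ALTER TABLE page ADD COLUMN last_success_status INTEGER")
    # backfill: pages whose last crawl succeeded (status 20) carry that status over
    con.execute(
        "UPDATE page SET last_success_status = last_status "
        "WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL"
    )
con.close()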