commit 80e589b1d481bb85344c9341bb61d690e58200de
parent cddbb82dfd62c8e7e2601fd53b46359cdca1bb06
Author: René Wagner <rwa@clttr.info>
Date: Mon, 12 Jul 2021 19:27:57 +0200
commit search index only when indexing is complete
unnecessary commits during indexing are time-consuming
remove dead "feedparser" code from crawl
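The Whoosh writer used to commit every 5000 documents, and every page row
was saved to sqlite individually while indexing; both are now deferred to
the end of the run. A rough sketch of the new pattern (whoosh and peewee,
as in the diff below; the model, schema and driver here are illustrative,
not the actual GUS code):

    from datetime import datetime

    import peewee
    from whoosh import index as whoosh_index
    from whoosh.fields import ID, TEXT, Schema

    db = peewee.SqliteDatabase("gus.sqlite")

    class Page(peewee.Model):
        url = peewee.TextField()
        last_status = peewee.IntegerField(null=True)
        indexed_at = peewee.DateTimeField(null=True)

        class Meta:
            database = db

    def build_index(documents, index_dir):
        """documents: iterable of (url, text) pairs; index_dir must exist."""
        schema = Schema(url=ID(stored=True, unique=True), content=TEXT)
        ix = whoosh_index.create_in(index_dir, schema)
        writer = ix.writer()
        for url, text in documents:
            # old behaviour: a page.save() per document here, plus a
            # writer.commit() every 5000 documents
            writer.add_document(url=url, content=text)
        writer.commit()  # new behaviour: one commit once indexing is complete
        # ...and a single bulk UPDATE instead of one save() per row.
        # peewee's Model.raw() is lazy: the statement only runs when the
        # query is iterated or .execute() is called.
        Page.raw(
            "UPDATE page SET indexed_at = ? WHERE last_status == 20",
            datetime.utcnow(),
        ).execute()

With this, the page table is written once, after the index commit,
instead of once per document.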
Diffstat:
4 files changed, 26 insertions(+), 124 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -67,9 +67,10 @@ AND l.is_cross_host_like == 1""",
try:
logging.debug("Adding document to index: %s", page.url);
index.add_document(document)
- page.indexed_at=datetime.utcnow()
- page.save()
-
+# logging.debug("Updating page in sqlite store: %s", page.url)
+# page.indexed_at=datetime.utcnow()
+# page.save()
+ logging.debug("Document done")
return True
except Exception as e:
logging.exception(
@@ -120,10 +121,30 @@ OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PA
index_page(index, page)
try:
+ logging.info("Commiting search index...")
index.close()
+ logging.info("Updating raw data...")
+ if should_run_destructive:
+ pages = Page.raw(
+ """UPDATE page SET indexed_at = ?
+WHERE last_status == 20
+AND (content_type NOT LIKE 'text/%'
+OR (content_type LIKE 'text/%' AND size <= ?))""",
+datetime.utcnow(), constants.MAXIMUM_TEXT_PAGE_SIZE).execute()
+ else:
+ pages = Page.raw(
+ """UPDATE page SET indexed_at = ?
+WHERE last_status == 20
+AND (indexed_at IS NULL OR
+indexed_at < last_crawl_success_at)
+AND (content_type NOT LIKE 'text/%'
+OR (content_type LIKE 'text/%' AND size <= ?))""",
+datetime.utcnow(), constants.MAXIMUM_TEXT_PAGE_SIZE).execute()
+
except Exception as e:
logging.error('Closing of index failed: %s', e);
+ logging.debug("Updating statistics...")
index_statistics = compute_index_statistics(db)
log_index_statistics(index_statistics)
persist_statistics(index_statistics, None, should_run_destructive, "statistics.csv")
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -8,13 +8,6 @@ import pathlib
import time
from urllib.parse import urljoin, uses_relative, uses_netloc
-# TODO: this import breaks with Python 3.9, but all code that relies
-# on this code path is currently dead code, so for now I'm just
-# commenting out the import. It would be nice to make an actual
-# decision soon about whether or not feed-based updating is even
-# worth keeping around. If not, the dead code paths could simply
-# deleted, and GUS would get a bit simpler :)
-# import feedparser
import peewee
from gus.excludes import EXCLUDED_URL_PREFIXES, EXCLUDED_URL_PATHS
@@ -510,99 +503,6 @@ def load_seed_request_urls():
content = [x.strip() for x in content]
return content
-
-def load_feed_urls(filename):
- feeds = []
- with open(filename, "r") as fp:
- for line in fp:
- line = line.strip()
- if not line or line.startswith("#"):
- continue
- feeds.append(line)
- return feeds
-
-
-def items_from_feed_string(feed_str):
- feed_obj = feedparser.parse(feed_str)
- feed = feed_obj.feed
- return [
- (entry.updated_parsed, entry.link, entry.title, feed.title)
- for entry in feed_obj.entries
- ]
-
-
-def resolve_feed_content_urls(feed_file=constants.FEED_FILE):
- # Load feed URLs to query
- feed_urls = load_feed_urls(feed_file)
- N = len(feed_urls)
-
- # Prepare to extract feed items
- last_accessed = {}
- skips = 0
- items = []
- while feed_urls:
- # Get a feed URL to fetch
- feed_url = feed_urls.pop()
- feed_resource = GeminiResource(feed_url)
-
- # Don't hammer servers
- last = last_accessed.get(feed_resource.normalized_host, 0)
- now = time.time()
- interval = int(now - last)
- if interval < 5:
- logging.warn(
- "Declining to hit %s again after only %d seconds",
- gus.lib.logging.strip_control_chars(feed_resource.normalized_host),
- interval,
- )
- feed_urls.insert(0, feed_url)
- skips += 1
- if skips == len(feed_urls):
- # We've hammered every server in the queue! Sleep a bit...
- logging.warn("Sleeping to give all servers a rest!")
- time.sleep(5)
- continue
- skips = 0
-
- # Good to go
- logging.info("Fetching feed: %s", gus.lib.logging.strip_control_chars(feed_url))
- try:
- resp = feed_resource.fetch()
- except:
- logging.info(
- "Error fetching feed, skipping: %s",
- gus.lib.logging.strip_control_chars(feed_url),
- )
- continue
- if resp and resp.status == "20":
- last_accessed[feed_resource.normalized_host] = time.time()
- items.extend(items_from_feed_string(resp.content))
- return [item[1] for item in items]
-
-
-def recrawl_feeds():
- content_urls = resolve_feed_content_urls()
- global index_dir
- index_dir = constants.INDEX_DIR
- global db
- db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
- global max_crawl_depth
- max_crawl_depth = 0
- global robot_file_map
- robot_file_map = {}
- global domain_hit_timings
- domain_hit_timings = {}
-
- seed_resources = [GeminiResource(url) for url in content_urls]
- for resource in seed_resources:
- crawl_page(resource, 0)
-
- logging.debug(
- "Recrawled feeds: %s", gus.lib.logging.strip_control_chars(content_urls)
- )
- logging.info("Finished!")
-
-
def run_crawl(should_run_destructive=False, seed_urls=[]):
global index_dir
index_dir = constants.INDEX_DIR if should_run_destructive else constants.INDEX_DIR
@@ -632,10 +532,7 @@ def main():
args = parse_args()
gus.lib.logging.handle_arguments(args)
- if args.should_recrawl_feeds:
- recrawl_feeds()
- else:
- run_crawl(args.should_run_destructive, seed_urls=args.seed_urls)
+ run_crawl(args.should_run_destructive, seed_urls=args.seed_urls)
def parse_args():
@@ -649,14 +546,6 @@ def parse_args():
help="create a fresh index and perform a full Geminispace crawl",
)
parser.add_argument(
- "--feeds",
- "-f",
- dest="should_recrawl_feeds",
- action="store_true",
- default=False,
- help="recrawl known atom feeds",
- )
- parser.add_argument(
"--seeds",
"-s",
metavar="URL",
@@ -669,6 +558,5 @@ def parse_args():
args = parser.parse_args()
return args
-
if __name__ == "__main__":
main()
diff --git a/gus/excludes.py b/gus/excludes.py
@@ -140,7 +140,7 @@ EXCLUDED_URL_PREFIXES = [
"gemini://gemini.cabestan.tk/hn",
"gemini://hn.filiuspatris.net/",
"gemini://schmittstefan.de/de/nachrichten/",
-
+ "gemini://gmi.noulin.net/mobile"
# wikipedia proxy
"gemini://wp.pitr.ca/",
"gemini://wp.glv.one/",
diff --git a/gus/lib/search.py b/gus/lib/search.py
@@ -41,7 +41,6 @@ class Index:
order=whoosh.highlight.SCORE,
)
- self._write_counter = 0
self._writer = None
def _create(self, index_storage):
@@ -76,12 +75,6 @@ class Index:
self._index.close()
def _rolling_writer(self):
- self._write_counter += 1
- if self._writer and self._write_counter % 5000 == 0:
- logging.debug("Committing index.")
- self._writer.commit()
- self._writer = None
-
if not self._writer:
self._writer = self._index.writer()