geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 80e589b1d481bb85344c9341bb61d690e58200de
parent cddbb82dfd62c8e7e2601fd53b46359cdca1bb06
Author: René Wagner <rwa@clttr.info>
Date:   Mon, 12 Jul 2021 19:27:57 +0200

commit search index only when indexing is complete

unnecessary commits during indexing are time-consuming
remove dead "feedparser" code from crawl

Diffstat:
M gus/build_index.py  |  27 ++++++++++++++++++++++++---
M gus/crawl.py        | 114 +------------------------------------------------------------------------------
M gus/excludes.py     |   2 +-
M gus/lib/search.py   |   7 -------
4 files changed, 26 insertions(+), 124 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -67,9 +67,10 @@ AND l.is_cross_host_like == 1""",
     try:
         logging.debug("Adding document to index: %s", page.url);
         index.add_document(document)
-        page.indexed_at=datetime.utcnow()
-        page.save()
-
+# logging.debug("Updating ge in sqlite store: %s", page.url)
+# page.indexed_at=datetime.utcnow()
+# page.save()
+        logging.debug("Document done")
         return True
     except Exception as e:
         logging.exception(
@@ -120,10 +121,30 @@ OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PA
         index_page(index, page)

     try:
+        logging.info("Commiting search index...")
         index.close()
+        logging.info("Updating raw data...")
+        if (should_run_destructive):
+            pages = Page.raw(
+                """UPDATE page SET indexed_at = ?
+WHERE last_status == 20
+AND (content_type NOT LIKE 'text/%'
+OR (content_type LIKE 'text/%' AND size <= ?))""",
+datetime.utcnow(), constants.MAXIMUM_TEXT_PAGE_SIZE)
+        else:
+            pages = Page.raw(
+                """UPDATE page SET indexed_at = ?
+WHERE last_status == 20
+AND (indexed_at IS NULL OR
+indexed_at < last_crawl_success_at)
+AND (content_type NOT LIKE 'text/%'
+OR (content_type LIKE 'text/%' AND size <= ?))""",
+datetime.utcnow(), constants.MAXIMUM_TEXT_PAGE_SIZE)
+
     except Exception as e:
         logging.error('Closing of index failed: %s', e);

+    logging.debug("Updating statistics...")
     index_statistics = compute_index_statistics(db)
     log_index_statistics(index_statistics)
     persist_statistics(index_statistics, None, should_run_destructive, "statistics.csv")
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -8,13 +8,6 @@ import pathlib
 import time
 from urllib.parse import urljoin, uses_relative, uses_netloc

-# TODO: this import breaks with Python 3.9, but all code that relies
-# on this code path is currently dead code, so for now I'm just
-# commenting out the import. It would be nice to make an actual
-# decision soon about whether or not feed-based updating is even
-# worth keeping around. If not, the dead code paths could simply
-# deleted, and GUS would get a bit simpler :)
-# import feedparser
 import peewee

 from gus.excludes import EXCLUDED_URL_PREFIXES, EXCLUDED_URL_PATHS
@@ -510,99 +503,6 @@ def load_seed_request_urls():
     content = [x.strip() for x in content]
     return content

-
-def load_feed_urls(filename):
-    feeds = []
-    with open(filename, "r") as fp:
-        for line in fp:
-            line = line.strip()
-            if not line or line.startswith("#"):
-                continue
-            feeds.append(line)
-    return feeds
-
-
-def items_from_feed_string(feed_str):
-    feed_obj = feedparser.parse(feed_str)
-    feed = feed_obj.feed
-    return [
-        (entry.updated_parsed, entry.link, entry.title, feed.title)
-        for entry in feed_obj.entries
-    ]
-
-
-def resolve_feed_content_urls(feed_file=constants.FEED_FILE):
-    # Load feed URLs to query
-    feed_urls = load_feed_urls(feed_file)
-    N = len(feed_urls)
-
-    # Prepare to extract feed items
-    last_accessed = {}
-    skips = 0
-    items = []
-    while feed_urls:
-        # Get a feed URL to fetch
-        feed_url = feed_urls.pop()
-        feed_resource = GeminiResource(feed_url)
-
-        # Don't hammer servers
-        last = last_accessed.get(feed_resource.normalized_host, 0)
-        now = time.time()
-        interval = int(now - last)
-        if interval < 5:
-            logging.warn(
-                "Declining to hit %s again after only %d seconds",
-                gus.lib.logging.strip_control_chars(feed_resource.normalized_host),
-                interval,
-            )
-            feed_urls.insert(0, feed_url)
-            skips += 1
-            if skips == len(feed_urls):
-                # We've hammered every server in the queue! Sleep a bit...
-                logging.warn("Sleeping to give all servers a rest!")
-                time.sleep(5)
-            continue
-        skips = 0
-
-        # Good to go
-        logging.info("Fetching feed: %s", gus.lib.logging.strip_control_chars(feed_url))
-        try:
-            resp = feed_resource.fetch()
-        except:
-            logging.info(
-                "Error fetching feed, skipping: %s",
-                gus.lib.logging.strip_control_chars(feed_url),
-            )
-            continue
-        if resp and resp.status == "20":
-            last_accessed[feed_resource.normalized_host] = time.time()
-            items.extend(items_from_feed_string(resp.content))
-    return [item[1] for item in items]
-
-
-def recrawl_feeds():
-    content_urls = resolve_feed_content_urls()
-    global index_dir
-    index_dir = constants.INDEX_DIR
-    global db
-    db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
-    global max_crawl_depth
-    max_crawl_depth = 0
-    global robot_file_map
-    robot_file_map = {}
-    global domain_hit_timings
-    domain_hit_timings = {}
-
-    seed_resources = [GeminiResource(url) for url in content_urls]
-    for resource in seed_resources:
-        crawl_page(resource, 0)
-
-    logging.debug(
-        "Recrawled feeds: %s", gus.lib.logging.strip_control_chars(content_urls)
-    )
-    logging.info("Finished!")
-
-
 def run_crawl(should_run_destructive=False, seed_urls=[]):
     global index_dir
     index_dir = constants.INDEX_DIR if should_run_destructive else constants.INDEX_DIR
@@ -632,10 +532,7 @@ def main():
     args = parse_args()
     gus.lib.logging.handle_arguments(args)

-    if args.should_recrawl_feeds:
-        recrawl_feeds()
-    else:
-        run_crawl(args.should_run_destructive, seed_urls=args.seed_urls)
+    run_crawl(args.should_run_destructive, seed_urls=args.seed_urls)


 def parse_args():
@@ -649,14 +546,6 @@ def parse_args():
         help="create a fresh index and perform a full Geminispace crawl",
     )
     parser.add_argument(
-        "--feeds",
-        "-f",
-        dest="should_recrawl_feeds",
-        action="store_true",
-        default=False,
-        help="recrawl known atom feeds",
-    )
-    parser.add_argument(
         "--seeds",
         "-s",
         metavar="URL",
@@ -669,6 +558,5 @@ def parse_args():
     args = parser.parse_args()
     return args

-
 if __name__ == "__main__":
     main()
diff --git a/gus/excludes.py b/gus/excludes.py
@@ -140,7 +140,7 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://gemini.cabestan.tk/hn",
     "gemini://hn.filiuspatris.net/",
     "gemini://schmittstefan.de/de/nachrichten/",
-
+    "gemini://gmi.noulin.net/mobile"
     # wikipedia proxy
     "gemini://wp.pitr.ca/",
     "gemini://wp.glv.one/",
diff --git a/gus/lib/search.py b/gus/lib/search.py
@@ -41,7 +41,6 @@ class Index:
             order=whoosh.highlight.SCORE,
         )

-        self._write_counter = 0
         self._writer = None

     def _create(self, index_storage):
@@ -76,12 +75,6 @@ class Index:
         self._index.close()

     def _rolling_writer(self):
-        self._write_counter += 1
-        if self._writer and self._write_counter % 5000 == 0:
-            logging.debug("Committing index.")
-            self._writer.commit()
-            self._writer = None
-
         if not self._writer:
             self._writer = self._index.writer()