geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit fa2db540f6a05e5be6423a0e6c4d24950f8fbdf8
parent b484a4dadc7b58eac2f13188a748f64a83d58a26
Author: René Wagner <rwa@clttr.info>
Date:   Thu, 14 Oct 2021 18:54:12 +0200

more meta data for index cleanup

Diffstat:
M gus/build_index.py          | 33 ++++++---------------------------
M gus/constants.py            |  2 +-
M gus/crawl.py                | 18 +++++++++++-------
M gus/excludes.py             |  2 +-
M gus/lib/db_model.py         |  1 +
M gus/lib/index_statistics.py |  8 ++++----
6 files changed, 24 insertions(+), 40 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -79,36 +79,23 @@ AND l.is_cross_host_like == 1""",
     return False
 
 
-def invalidate_recent_results(index, invalidation_window):
-    recency_minimum = datetime.now() - timedelta(hours=invalidation_window)
-    pages = Page.select().where(
-        Page.indexed_at.is_null(False), Page.indexed_at > recency_minimum
-    )
-    logging.debug('Invalidating %d pages %s', pages.count(), recency_minimum)
-    for page in pages:
-        index.delete_by_term("url_id", page.url)
-
-
-def build_index(should_run_destructive=False, invalidation_window=0):
+def build_index(should_run_destructive=False):
     index_dir = constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR
-#    index_dir = constants.INDEX_DIR_NEW
 
     db = init_db(index_dir + "/gus.sqlite")
     index = search.Index(index_dir, should_run_destructive)
-    invalidate_recent_results(index, invalidation_window)
-
     if (should_run_destructive):
         pages = Page.raw(
             """SELECT p.* FROM page AS p
-WHERE p.last_status == 20
+WHERE p.last_success_status == 20
 AND (p.content_type NOT LIKE 'text/%'
 OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PAGE_SIZE
         )
     else:
         pages = Page.raw(
             """SELECT p.* FROM page AS p
-WHERE p.last_status == 20
+WHERE p.last_success_status == 20
 AND (p.indexed_at IS NULL
 OR p.indexed_at < p.last_crawl_success_at)
 AND (p.content_type NOT LIKE 'text/%'
@@ -126,14 +113,14 @@ OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PA
     if (should_run_destructive):
         pages = Page.raw(
             """SELECT p.* FROM page AS p
-WHERE p.last_status == 20
+WHERE p.last_success_status == 20
 AND (p.content_type NOT LIKE 'text/%'
 OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PAGE_SIZE
         )
     else:
         pages = Page.raw(
             """SELECT p.* FROM page AS p
-WHERE p.last_status == 20
+WHERE p.last_success_status == 20
 AND (p.indexed_at IS NULL
 OR p.indexed_at < p.last_crawl_success_at)
 AND (p.content_type NOT LIKE 'text/%'
@@ -159,7 +146,7 @@ OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PA
 def main():
     args = parse_args()
     gus.lib.logging.handle_arguments(args)
-    build_index(args.should_run_destructive, args.invalidation_window)
+    build_index(args.should_run_destructive)
 
 
 def parse_args():
@@ -172,14 +159,6 @@ def parse_args():
         default=False,
         help="create a fresh index",
     )
-    parser.add_argument(
-        "--invalidation_window",
-        "-i",
-        dest="invalidation_window",
-        type=int,
-        default=0,
-        help="a recency window, in hours, for recently crawled pages that should be forcefully reindexed",
-    )
     gus.lib.logging.add_arguments(parser)
     args = parser.parse_args()
     return args
diff --git a/gus/constants.py b/gus/constants.py
@@ -6,7 +6,7 @@ DB_FILENAME = "gus.sqlite"
 FEED_FILE = "feeds.txt"
 MAXIMUM_REDIRECT_CHAIN_LENGTH = 5
 MAXIMUM_FAILED_REQUEST_COUNT = 5
-MAXIMUM_TEXT_PAGE_SIZE = 5120000 # 1000KB, in bytes
+MAXIMUM_TEXT_PAGE_SIZE = 5120000 # in bytes
 
 # default change frequencies (in hours)
 ROOT_CHANGE_FREQUENCY_DEFAULT = 24
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -49,7 +49,8 @@ def index_binary(resource, response):
         "last_crawl_at": datetime.utcnow(),
         "last_crawl_success_at": datetime.utcnow(),
         "last_status" : response.status,
-        "last_stats_message" : response.error_message,
+        "last_success_status": response.status,
+        "last_status_message" : response.error_message,
         "first_seen_at" : datetime.utcnow()
     }
     existing_page = Page.get_or_none(url=resource.normalized_url)
@@ -89,7 +90,8 @@ def index_redirect(resource, response):
         "last_crawl_at": datetime.utcnow(),
         "last_crawl_success_at": datetime.utcnow(),
         "last_status" : response.status,
-        "last_stats_message" : response.error_message,
+        "last_success_status" : response.status,
+        "last_status_message" : response.error_message,
         "first_seen_at" : datetime.utcnow()
     }
     existing_page = Page.get_or_none(url=resource.normalized_url)
@@ -164,7 +166,8 @@ def index_prompt(resource, response):
         "last_crawl_at": datetime.utcnow(),
         "last_crawl_success_at": datetime.utcnow(),
         "last_status" : response.status,
-        "last_stats_message" : response.error_message,
+        "last_success_status" : response.status,
+        "last_status_message" : response.error_message,
         "first_seen_at" : datetime.utcnow()
     }
     existing_page = Page.get_or_none(url=resource.normalized_url)
@@ -208,7 +211,8 @@ def index_content(resource, response):
         "last_crawl_at": datetime.utcnow(),
         "last_crawl_success_at": datetime.utcnow(),
         "last_status" : response.status,
-        "last_stats_message" : response.error_message,
+        "last_success_status" : response.status,
+        "last_status_message" : response.error_message,
         "first_seen_at" : datetime.utcnow()
     }
     if response.content_type == "text/gemini":
@@ -257,10 +261,10 @@ def should_skip(resource):
             if m:
                 should_skip = True
     except:
-        logging.error("Error checking for exclude of url: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
-        should_skip = True
+        logging.error("Error checking for exclude of url: %s", gus.lib.logging.strip_control_chars(resource.raw_url))
+        should_skip = True
 
-    return should_skip
+    return should_skip
 
 
 def index_links(from_resource, contained_resources):
diff --git a/gus/excludes.py b/gus/excludes.py
@@ -2,7 +2,6 @@
 # prepended with the gemini:// protocol, be all lowercased, and
 # not have the port specified if it is 1965.
 EXCLUDED_URL_PREFIXES = [
-    # test and other invalid URIs
     "gemini://localhost",
     "gemini://example.org",
     "gemini://example.com",
@@ -136,6 +135,7 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://gemini.susa.net/cgi-bin/search?",
     "gemini://gemini.susa.net/cgi-bin/twitter?",
     "gemini://gemini.susa.net/cgi-bin/vim-search?",
+    "gemini://gemini.susa.net/cgi-bin/links_stu.lua?",
 
     "gemini://gemini.spam.works/textfiles/",
     "gemini://gemini.spam.works/mirrors/textfiles/",
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -46,6 +46,7 @@ class Page(Model):
     last_crawl_success_at = DateTimeField(null=True)
     last_status = IntegerField(null=True)
     last_status_message = TextField(null=True)
+    last_success_status = IntegerField(null=True)
     first_seen_at = DateTimeField(null=True)
 
 class Link(Model):
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -10,11 +10,11 @@ from gus.lib.db_model import Page
 def compute_index_statistics(db):
     page_count = len(Page.raw("""SELECT DISTINCT p.id
 FROM page AS p
-WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL""").dicts())
+WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL""").dicts())
 
     domains_query = Page.raw("""SELECT DISTINCT p.domain, p.port
 FROM page AS p
-WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL""")
+WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL""")
     domains = []
     for d in domains_query.execute():
         s = d.domain
@@ -32,12 +32,12 @@ WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL""")
 
     content_type_frequencies = (Page.raw("""SELECT p.content_type, count(p.content_type) as 'count'
 FROM page AS p
-WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL
+WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL
 GROUP BY p.content_type
 ORDER BY 2 desc""").dicts())
     charset_frequencies = (Page.raw("""SELECT upper(p.charset), count(p.id) as 'count'
 FROM page AS p
-WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL AND p.charset IS NOT NULL
+WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL AND p.charset IS NOT NULL
 GROUP BY upper(p.charset)
 ORDER BY 2 desc""").dicts())
     index_modification_time = Page.select(fn.Max(Page.last_crawl_at)).scalar()
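
Note: the commit adds the last_success_status column to the peewee Page model and starts filtering on it, but no database migration is included in the diff, so an existing gus.sqlite presumably needs the column added and backfilled by hand before build_index.py will find any pages. A minimal one-off sketch, assuming plain sqlite3 access, a hypothetical database path, and that backfilling from last_status is acceptable:

# one-off migration sketch -- an assumption, not part of this commit
import sqlite3

DB_PATH = "index/gus.sqlite"  # hypothetical; the crawler opens index_dir + "/gus.sqlite"

con = sqlite3.connect(DB_PATH)
with con:
    # add the column that build_index.py and index_statistics.py now filter on
    con.execute("ALTER TABLE page ADD COLUMN last_success_status INTEGER")
    # backfill: pages whose last crawl succeeded (status 20) carry that status over
    con.execute(
        "UPDATE page SET last_success_status = last_status "
        "WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL"
    )
con.close()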