geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit fae9d9d5fe1bfb84af91c6a93743597ddd04042c
parent 0b45da52c1fc5ee927f2ea9e90570a8bd38fe3fc
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Sun,  2 Aug 2020 09:46:53 -0400

[crawl] Improve handling of change_frequency

This change centralizes the change_frequency logic in lib/gemini.py
as a first step. It also fixes a bug where the crawl was incrementing
the change_frequency when the page *was* changed; the value is now
reset to its default when the content changes and incremented
otherwise. Lastly, it adds pikkulog detection, so those pages get
crawled frequently as well (which will help them stay current in
thread construction).

Diffstat:
M gus/constants.py  |  1 +
M gus/crawl.py      | 44 +++++++++++++++++++++++---------------------
M gus/lib/gemini.py | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
3 files changed, 87 insertions(+), 23 deletions(-)

diff --git a/gus/constants.py b/gus/constants.py @@ -11,5 +11,6 @@ DEFAULT_NON_ROOT_CHANGE_FREQUENCY = 24 * 7 DEFAULT_REDIRECT_CHANGE_FREQUENCY = 24 * 7 DEFAULT_BINARY_CHANGE_FREQUENCY = 24 * 30 DEFAULT_PROMPT_CHANGE_FREQUENCY = 24 * 30 +DEFAULT_PIKKULOG_CHANGE_FREQUENCY = 3 DEFAULT_TEMP_ERROR_CHANGE_FREQUENCY = 24 DEFAULT_PERM_ERROR_CHANGE_FREQUENCY = 24 * 30 * 3 diff --git a/gus/crawl.py b/gus/crawl.py @@ -53,6 +53,7 @@ EXCLUDED_URL_PREFIXES = [ "gemini://gus.guru/search?", "gemini://gus.guru/v/search?", "gemini://gus.guru/add-seed?", + "gemini://gus.guru/backlinks?", # Houston "gemini://houston.coder.town/search?", @@ -153,14 +154,13 @@ def index_binary(resource, response): "content_type": response.content_type, "charset": response.charset, "size": response.num_bytes, - "change_frequency": constants.DEFAULT_BINARY_CHANGE_FREQUENCY, + "change_frequency": resource.get_default_change_frequency("binary"), } existing_page = Page.get_or_none(url=resource.indexable_url) if existing_page: doc["id"] = existing_page.id - if existing_page.change_frequency: - doc["change_frequency"] = existing_page.change_frequency + 24 * 7 - doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"] + existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("binary") + doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "binary") page = Page(**doc) page.save() return page @@ -174,13 +174,13 @@ def index_redirect(resource): "normalized_url": resource.normalized_url, "domain": resource.normalized_host, "port": resource.urlsplit.port or 1965, - "change_frequency": constants.DEFAULT_REDIRECT_CHANGE_FREQUENCY, + "change_frequency": resource.get_default_change_frequency("redirect"), } existing_page = Page.get_or_none(url=resource.indexable_url) if existing_page: doc["id"] = existing_page.id - if existing_page.change_frequency: - doc["change_frequency"] = existing_page.change_frequency + 24 * 
2 + existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("redirect") + doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "redirect") page = Page(**doc) page.save() return page @@ -188,22 +188,21 @@ def index_redirect(resource): def index_error(resource, is_temporary): print("INDEXING ERROR...") + category = "temp_error" if is_temporary else "perm_error" + default_change_frequency = resource.get_default_change_frequency(category) doc = { "url": resource.indexable_url, "fetchable_url": resource.fetchable_url, "normalized_url": resource.normalized_url, "domain": resource.normalized_host, "port": resource.urlsplit.port or 1965, - "change_frequency": constants.DEFAULT_TEMP_ERROR_CHANGE_FREQUENCY if is_temporary else constants.DEFAULT_PERM_ERROR_CHANGE_FREQUENCY, + "change_frequency": default_change_frequency, } existing_page = Page.get_or_none(url=resource.indexable_url) if existing_page: doc["id"] = existing_page.id - if existing_page.change_frequency: - if is_temporary: - doc["change_frequency"] = existing_page.change_frequency + 12 - else: - doc["change_frequency"] = existing_page.change_frequency + 24 * 30 + existing_change_frequency = existing_page.change_frequency or default_change_frequency + doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, category) page = Page(**doc) page.save() return page @@ -222,13 +221,13 @@ def index_prompt(resource, response): "charset": response.charset, "size": response.num_bytes, "prompt": response.prompt, - "change_frequency": constants.DEFAULT_PROMPT_CHANGE_FREQUENCY, + "change_frequency": resource.get_default_change_frequency("prompt"), } existing_page = Page.get_or_none(url=resource.indexable_url) if existing_page: doc["id"] = existing_page.id - if existing_page.change_frequency: - doc["change_frequency"] = existing_page.change_frequency + 24 * 7 + existing_change_frequency = existing_page.change_frequency 
or resource.get_default_change_frequency("prompt") + doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "prompt") page = Page(**doc) page.save() return page @@ -236,7 +235,6 @@ def index_prompt(resource, response): def index_content(resource, response): print("INDEXING CONTENT...") - change_frequency = constants.DEFAULT_ROOT_CHANGE_FREQUENCY if resource.is_root_like or resource.is_log_root_like else constants.DEFAULT_NON_ROOT_CHANGE_FREQUENCY doc = { "url": resource.indexable_url, "fetchable_url": resource.fetchable_url, @@ -247,7 +245,7 @@ def index_content(resource, response): "charset": response.charset, "content": response.content, "size": response.num_bytes, - "change_frequency": change_frequency, + "change_frequency": resource.get_default_change_frequency("content"), } if response.content_type == "text/gemini": doc["lang"] = response.lang or "none", @@ -255,9 +253,13 @@ def index_content(resource, response): is_different = False if existing_page: doc["id"] = existing_page.id - is_different = existing_page.content is not None and doc["content"] != existing_page.content - if existing_page.change_frequency and is_different and not (resource.is_root_like or resource.is_log_root_like): - doc["change_frequency"] = existing_page.change_frequency + 24 * 3 + if existing_page.content: + is_different = doc["content"] != existing_page.content + if is_different: + doc["change_frequency"] = resource.get_default_change_frequency("content") + else: + existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("content") + doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "content") page = Page(**doc) page.save() return page, is_different diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py @@ -4,6 +4,7 @@ from urllib.robotparser import RobotFileParser import gusmobile +from gus import constants from gus.lib.domain import is_domain # hack(natpen): the built-in 
methods in urllib need to know the @@ -20,6 +21,8 @@ LOG_POST_BOSTON_LIKE_PATTERN = re.compile("^/boston/\d{4}/\d{2}/\d+\.\d+", flags ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE) ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE) +PIKKULOG_LIKE_PATTERN = re.compile(".*/pikkulog/.*", flags=re.IGNORECASE) + AUTHOR_URL_PATTERN = re.compile("^/~([a-z][-a-z0-9]*)/|^/users/~?([a-z][-a-z0-9]*)", flags=re.IGNORECASE) AUTHOR_CONTENT_PATTERN = re.compile(".*(by|author): ([\w\s\d]+)", flags=re.IGNORECASE | re.MULTILINE) @@ -65,6 +68,8 @@ class GeminiResource(): self._is_root_like = None self._is_log_root_like = None self._is_log_post_like = None + self._is_pikkulog_like = None + self._default_change_frequency = None self.contained_resources = None def urlsplit_featureful(url, fully_qualified_parent_url=None, parent_hostname=None): @@ -178,6 +183,16 @@ class GeminiResource(): return self._is_root_like + def _get_is_pikkulog_like(self): + if self._is_pikkulog_like is None: + is_pikkulog_like = False + pikkulog_match = PIKKULOG_LIKE_PATTERN.match(self.urlsplit.path) + if pikkulog_match: + is_pikkulog_like = True + self._is_pikkulog_like = is_pikkulog_like + return self._is_pikkulog_like + + def _get_is_log_root_like(self): if self._is_log_root_like is None: is_log_root_like = False @@ -194,6 +209,7 @@ class GeminiResource(): post_like_exclusion_match = LOG_POST_LIKE_EXCLUSION_PATTERN.match(self.urlsplit.path) post_gemlogblue_match = LOG_POST_GEMLOGBLUE_LIKE_PATTERN.match(self.urlsplit.path) post_boston_match = LOG_POST_BOSTON_LIKE_PATTERN.match(self.urlsplit.path) + if (post_like_match and not post_like_exclusion_match) or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match) or (self.normalized_host == "gemini.conman.org" and post_boston_match): is_log_post_like = True self._is_log_post_like = is_log_post_like @@ -244,10 +260,54 @@ class 
GeminiResource(): return friendly_title - def get_log_post_date(self, content): - if not self.is_log_post_like: + def get_default_change_frequency(self, category): + if not self.is_valid: return None + if self._default_change_frequency is None: + if category == "content": + if self.is_root_like or self.is_log_root_like: + change_frequency = constants.DEFAULT_ROOT_CHANGE_FREQUENCY + elif self.is_pikkulog_like: + change_frequency = constants.DEFAULT_PIKKULOG_CHANGE_FREQUENCY + else: + change_frequency = constants.DEFAULT_NON_ROOT_CHANGE_FREQUENCY + elif category == "binary": + change_frequency = constants.DEFAULT_BINARY_CHANGE_FREQUENCY + elif category == "redirect": + change_frequency = constants.DEFAULT_REDIRECT_CHANGE_FREQUENCY + elif category == "temp_error": + change_frequency = constants.DEFAULT_TEMP_ERROR_CHANGE_FREQUENCY + elif category == "perm_error": + change_frequency = constants.DEFAULT_PERM_ERROR_CHANGE_FREQUENCY + elif category == "prompt": + change_frequency = constants.DEFAULT_PROMPT_CHANGE_FREQUENCY + else: + raise Exception.NameError("Unrecognized resource category") + + self._default_change_frequency = change_frequency + return self._default_change_frequency + + def increment_change_frequency(self, existing_change_frequency, category): + if category == "content": + if self.is_root_like or self.is_log_root_like: + return existing_change_frequency + elif self.is_pikkulog_like: + return existing_change_frequency + 6 + else: + return existing_change_frequency + 24 * 3 + elif category == "binary": + return existing_change_frequency + 24 * 7 + elif category == "redirect": + return existing_change_frequency + 24 * 2 + elif category == "temp_error": + return existing_change_frequency + 12 + elif category == "perm_error": + return existing_change_frequency + 24 * 30 + elif category == "prompt": + return existing_change_frequency + 24 * 7 + else: + raise Exception.NameError("Unrecognized resource category") # constructed from fetchable_url @@ -264,6 +324,7 
@@ class GeminiResource(): is_root_like = property(_get_is_root_like) is_log_root_like = property(_get_is_log_root_like) is_log_post_like = property(_get_is_log_post_like) + is_pikkulog_like = property(_get_is_pikkulog_like) normalized_host_like = property(_get_normalized_host_like) def fetch(self):