commit fae9d9d5fe1bfb84af91c6a93743597ddd04042c
parent 0b45da52c1fc5ee927f2ea9e90570a8bd38fe3fc
Author: Natalie Pendragon <natpen@natpen.net>
Date: Sun, 2 Aug 2020 09:46:53 -0400
[crawl] Improve handling of change_frequency
This change centralizes the change_frequency logic in lib/gemini.py as a
first step. It also fixes a bug where the crawl incremented
change_frequency when the page *had* changed, rather than when it had
not. Lastly, it adds pikkulog detection, so those pages are crawled
frequently too (which will help them stay current in thread construction).
Diffstat:
3 files changed, 87 insertions(+), 23 deletions(-)
diff --git a/gus/constants.py b/gus/constants.py
@@ -11,5 +11,6 @@ DEFAULT_NON_ROOT_CHANGE_FREQUENCY = 24 * 7
DEFAULT_REDIRECT_CHANGE_FREQUENCY = 24 * 7
DEFAULT_BINARY_CHANGE_FREQUENCY = 24 * 30
DEFAULT_PROMPT_CHANGE_FREQUENCY = 24 * 30
+DEFAULT_PIKKULOG_CHANGE_FREQUENCY = 3
DEFAULT_TEMP_ERROR_CHANGE_FREQUENCY = 24
DEFAULT_PERM_ERROR_CHANGE_FREQUENCY = 24 * 30 * 3
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -53,6 +53,7 @@ EXCLUDED_URL_PREFIXES = [
"gemini://gus.guru/search?",
"gemini://gus.guru/v/search?",
"gemini://gus.guru/add-seed?",
+ "gemini://gus.guru/backlinks?",
# Houston
"gemini://houston.coder.town/search?",
@@ -153,14 +154,13 @@ def index_binary(resource, response):
"content_type": response.content_type,
"charset": response.charset,
"size": response.num_bytes,
- "change_frequency": constants.DEFAULT_BINARY_CHANGE_FREQUENCY,
+ "change_frequency": resource.get_default_change_frequency("binary"),
}
existing_page = Page.get_or_none(url=resource.indexable_url)
if existing_page:
doc["id"] = existing_page.id
- if existing_page.change_frequency:
- doc["change_frequency"] = existing_page.change_frequency + 24 * 7
- doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
+ existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("binary")
+ doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "binary")
page = Page(**doc)
page.save()
return page
@@ -174,13 +174,13 @@ def index_redirect(resource):
"normalized_url": resource.normalized_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
- "change_frequency": constants.DEFAULT_REDIRECT_CHANGE_FREQUENCY,
+ "change_frequency": resource.get_default_change_frequency("redirect"),
}
existing_page = Page.get_or_none(url=resource.indexable_url)
if existing_page:
doc["id"] = existing_page.id
- if existing_page.change_frequency:
- doc["change_frequency"] = existing_page.change_frequency + 24 * 2
+ existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("redirect")
+ doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "redirect")
page = Page(**doc)
page.save()
return page
@@ -188,22 +188,21 @@ def index_redirect(resource):
def index_error(resource, is_temporary):
print("INDEXING ERROR...")
+ category = "temp_error" if is_temporary else "perm_error"
+ default_change_frequency = resource.get_default_change_frequency(category)
doc = {
"url": resource.indexable_url,
"fetchable_url": resource.fetchable_url,
"normalized_url": resource.normalized_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
- "change_frequency": constants.DEFAULT_TEMP_ERROR_CHANGE_FREQUENCY if is_temporary else constants.DEFAULT_PERM_ERROR_CHANGE_FREQUENCY,
+ "change_frequency": default_change_frequency,
}
existing_page = Page.get_or_none(url=resource.indexable_url)
if existing_page:
doc["id"] = existing_page.id
- if existing_page.change_frequency:
- if is_temporary:
- doc["change_frequency"] = existing_page.change_frequency + 12
- else:
- doc["change_frequency"] = existing_page.change_frequency + 24 * 30
+ existing_change_frequency = existing_page.change_frequency or default_change_frequency
+ doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, category)
page = Page(**doc)
page.save()
return page
@@ -222,13 +221,13 @@ def index_prompt(resource, response):
"charset": response.charset,
"size": response.num_bytes,
"prompt": response.prompt,
- "change_frequency": constants.DEFAULT_PROMPT_CHANGE_FREQUENCY,
+ "change_frequency": resource.get_default_change_frequency("prompt"),
}
existing_page = Page.get_or_none(url=resource.indexable_url)
if existing_page:
doc["id"] = existing_page.id
- if existing_page.change_frequency:
- doc["change_frequency"] = existing_page.change_frequency + 24 * 7
+ existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("prompt")
+ doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "prompt")
page = Page(**doc)
page.save()
return page
@@ -236,7 +235,6 @@ def index_prompt(resource, response):
def index_content(resource, response):
print("INDEXING CONTENT...")
- change_frequency = constants.DEFAULT_ROOT_CHANGE_FREQUENCY if resource.is_root_like or resource.is_log_root_like else constants.DEFAULT_NON_ROOT_CHANGE_FREQUENCY
doc = {
"url": resource.indexable_url,
"fetchable_url": resource.fetchable_url,
@@ -247,7 +245,7 @@ def index_content(resource, response):
"charset": response.charset,
"content": response.content,
"size": response.num_bytes,
- "change_frequency": change_frequency,
+ "change_frequency": resource.get_default_change_frequency("content"),
}
if response.content_type == "text/gemini":
doc["lang"] = response.lang or "none",
@@ -255,9 +253,13 @@ def index_content(resource, response):
is_different = False
if existing_page:
doc["id"] = existing_page.id
- is_different = existing_page.content is not None and doc["content"] != existing_page.content
- if existing_page.change_frequency and is_different and not (resource.is_root_like or resource.is_log_root_like):
- doc["change_frequency"] = existing_page.change_frequency + 24 * 3
+ if existing_page.content:
+ is_different = doc["content"] != existing_page.content
+ if is_different:
+ doc["change_frequency"] = resource.get_default_change_frequency("content")
+ else:
+ existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("content")
+ doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "content")
page = Page(**doc)
page.save()
return page, is_different
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -4,6 +4,7 @@ from urllib.robotparser import RobotFileParser
import gusmobile
+from gus import constants
from gus.lib.domain import is_domain
# hack(natpen): the built-in methods in urllib need to know the
@@ -20,6 +21,8 @@ LOG_POST_BOSTON_LIKE_PATTERN = re.compile("^/boston/\d{4}/\d{2}/\d+\.\d+", flags
ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE)
ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE)
+PIKKULOG_LIKE_PATTERN = re.compile(".*/pikkulog/.*", flags=re.IGNORECASE)
+
AUTHOR_URL_PATTERN = re.compile("^/~([a-z][-a-z0-9]*)/|^/users/~?([a-z][-a-z0-9]*)", flags=re.IGNORECASE)
AUTHOR_CONTENT_PATTERN = re.compile(".*(by|author): ([\w\s\d]+)", flags=re.IGNORECASE | re.MULTILINE)
@@ -65,6 +68,8 @@ class GeminiResource():
self._is_root_like = None
self._is_log_root_like = None
self._is_log_post_like = None
+ self._is_pikkulog_like = None
+ self._default_change_frequency = None
self.contained_resources = None
def urlsplit_featureful(url, fully_qualified_parent_url=None, parent_hostname=None):
@@ -178,6 +183,16 @@ class GeminiResource():
return self._is_root_like
+    def _get_is_pikkulog_like(self):
+        """Tell whether this resource's URL path matches PIKKULOG_LIKE_PATTERN.
+
+        The answer is computed on first access and memoized on
+        ``self._is_pikkulog_like``; later calls return the stored value.
+        """
+        if self._is_pikkulog_like is not None:
+            return self._is_pikkulog_like
+        matched = PIKKULOG_LIKE_PATTERN.match(self.urlsplit.path)
+        self._is_pikkulog_like = True if matched else False
+        return self._is_pikkulog_like
+
+
def _get_is_log_root_like(self):
if self._is_log_root_like is None:
is_log_root_like = False
@@ -194,6 +209,7 @@ class GeminiResource():
post_like_exclusion_match = LOG_POST_LIKE_EXCLUSION_PATTERN.match(self.urlsplit.path)
post_gemlogblue_match = LOG_POST_GEMLOGBLUE_LIKE_PATTERN.match(self.urlsplit.path)
post_boston_match = LOG_POST_BOSTON_LIKE_PATTERN.match(self.urlsplit.path)
+
if (post_like_match and not post_like_exclusion_match) or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match) or (self.normalized_host == "gemini.conman.org" and post_boston_match):
is_log_post_like = True
self._is_log_post_like = is_log_post_like
@@ -244,10 +260,54 @@ class GeminiResource():
return friendly_title
- def get_log_post_date(self, content):
- if not self.is_log_post_like:
+    def get_default_change_frequency(self, category):
+        """Return the default change frequency for this resource and category.
+
+        ``category`` is one of "content", "binary", "redirect", "temp_error",
+        "perm_error" or "prompt". For "content" the value depends on whether
+        the resource is root-like / log-root-like, pikkulog-like, or neither.
+        All values come from the DEFAULT_*_CHANGE_FREQUENCY constants.
+
+        Returns None when the resource is not valid.
+        Raises NameError when ``category`` is not recognized.
+        """
+        if not self.is_valid:
             return None
+        # Memoize per category. A single cached value would be handed back
+        # for whatever category was requested first, even when a later call
+        # asks about a different one.
+        if self._default_change_frequency is None:
+            self._default_change_frequency = {}
+        if category not in self._default_change_frequency:
+            if category == "content":
+                if self.is_root_like or self.is_log_root_like:
+                    change_frequency = constants.DEFAULT_ROOT_CHANGE_FREQUENCY
+                elif self.is_pikkulog_like:
+                    change_frequency = constants.DEFAULT_PIKKULOG_CHANGE_FREQUENCY
+                else:
+                    change_frequency = constants.DEFAULT_NON_ROOT_CHANGE_FREQUENCY
+            elif category == "binary":
+                change_frequency = constants.DEFAULT_BINARY_CHANGE_FREQUENCY
+            elif category == "redirect":
+                change_frequency = constants.DEFAULT_REDIRECT_CHANGE_FREQUENCY
+            elif category == "temp_error":
+                change_frequency = constants.DEFAULT_TEMP_ERROR_CHANGE_FREQUENCY
+            elif category == "perm_error":
+                change_frequency = constants.DEFAULT_PERM_ERROR_CHANGE_FREQUENCY
+            elif category == "prompt":
+                change_frequency = constants.DEFAULT_PROMPT_CHANGE_FREQUENCY
+            else:
+                # NameError is a builtin; `Exception.NameError` does not exist
+                # and would surface as an AttributeError instead.
+                raise NameError("Unrecognized resource category")
+
+            self._default_change_frequency[category] = change_frequency
+        return self._default_change_frequency[category]
+
+    def increment_change_frequency(self, existing_change_frequency, category):
+        """Return ``existing_change_frequency`` increased by the step for ``category``.
+
+        ``category`` is one of "content", "binary", "redirect", "temp_error",
+        "perm_error" or "prompt". For "content", root-like and log-root-like
+        resources are returned unchanged, pikkulog-like resources grow by 6,
+        and everything else grows by 24 * 3.
+
+        Raises NameError when ``category`` is not recognized.
+        """
+        if category == "content":
+            if self.is_root_like or self.is_log_root_like:
+                # Root-like pages keep their current frequency.
+                return existing_change_frequency
+            elif self.is_pikkulog_like:
+                return existing_change_frequency + 6
+            else:
+                return existing_change_frequency + 24 * 3
+        elif category == "binary":
+            return existing_change_frequency + 24 * 7
+        elif category == "redirect":
+            return existing_change_frequency + 24 * 2
+        elif category == "temp_error":
+            return existing_change_frequency + 12
+        elif category == "perm_error":
+            return existing_change_frequency + 24 * 30
+        elif category == "prompt":
+            return existing_change_frequency + 24 * 7
+        else:
+            # NameError is a builtin; `Exception.NameError` does not exist
+            # and would surface as an AttributeError instead.
+            raise NameError("Unrecognized resource category")
# constructed from fetchable_url
@@ -264,6 +324,7 @@ class GeminiResource():
is_root_like = property(_get_is_root_like)
is_log_root_like = property(_get_is_log_root_like)
is_log_post_like = property(_get_is_log_post_like)
+ is_pikkulog_like = property(_get_is_pikkulog_like)
normalized_host_like = property(_get_normalized_host_like)
def fetch(self):