geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit c341bb82ae237de0fa3aff8b5bec1b3d5efa791c
parent 6c187c2af2168e097c45589482e36e723b541e03
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Thu, 23 Jul 2020 09:44:49 -0400

[crawl] Add is_cross_host_like field to db

Diffstat:
M gus/crawl.py | 1 +
M gus/lib/db_model.py | 5 +++++
M gus/lib/gemini.py | 19 +++++++++++++++++--
A scripts/add_is_cross_host_like.py | 24 ++++++++++++++++++++++++
4 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -277,6 +277,7 @@ def index_links(from_resource, contained_resources):
         data.append({
             "from_page": from_page,
             "to_page": to_page,
+            "is_cross_host_like": Link.get_is_cross_host_like(from_resource, cr),
         })
     Link.insert_many(data).execute()
 
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -47,6 +47,11 @@ class Link(Model):
 
     from_page = ForeignKeyField(Page, backref="outbound_links", on_delete='CASCADE')
     to_page = ForeignKeyField(Page, backref="backlinks", on_delete='CASCADE')
+    is_cross_host_like = BooleanField()
+
+    def get_is_cross_host_like(from_resource, to_resource):
+        return from_resource.normalized_host_like != to_resource.normalized_host_like
+
 
 class Crawl(Model):
     """
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -12,7 +12,8 @@ uses_relative.append("gemini")
 uses_netloc.append("gemini")
 
 LOG_LIKE_PATTERN = re.compile(".*/(gemlog|glog|starlog|pikkulog)/?$")
-ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$")
+ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$")
+ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?")
 
 class GeminiRobotFileParser(RobotFileParser):
     def set_url(self, url):
@@ -47,6 +48,7 @@ class GeminiResource():
         self.fully_qualified_parent_url = fully_qualified_parent_url
         self._normalized_url = None
         self._normalized_host = None
+        self._normalized_host_like = None
         self._fetchable_url = None
         self._indexable_url = None
         self._is_root_like = None
@@ -107,6 +109,18 @@ class GeminiResource():
         return self._normalized_host
 
 
+    def _get_normalized_host_like(self):
+        if not self.is_valid:
+            return None
+        if self._normalized_host_like is None:
+            normalized_host_like = self.normalized_host
+            m = ROOT_LIKE_PATTERN.match(self.urlsplit.path)
+            if m:
+                normalized_host_like += m[0].rstrip("/")
+            self._normalized_host_like = normalized_host_like
+        return self._normalized_host_like
+
+
     def _get_fetchable_url(self):
         if not self.is_valid:
             return None
@@ -146,7 +160,7 @@ class GeminiResource():
     def _get_is_root_like(self):
         if self._is_root_like is None:
             is_root_like = False
-            if self.urlsplit.path == "" or self.urlsplit.path == "/" or ROOT_LIKE_PATTERN.match(self.urlsplit.path):
+            if self.urlsplit.path == "" or self.urlsplit.path == "/" or ROOT_LIKE_ONLY_PATTERN.match(self.urlsplit.path):
                 is_root_like = True
             self._is_root_like = is_root_like
         return self._is_root_like
@@ -174,6 +188,7 @@ class GeminiResource():
     indexable_url = property(_get_indexable_url)
     is_root_like = property(_get_is_root_like)
     is_log_like = property(_get_is_log_like)
+    normalized_host_like = property(_get_normalized_host_like)
 
     def fetch(self):
         # NB: this intentionally does NOT fetch the normalized URL, because that could
diff --git a/scripts/add_is_cross_host_like.py b/scripts/add_is_cross_host_like.py
@@ -0,0 +1,24 @@
+from gus import constants
+from gus.lib.db_model import init_db, Link, Page
+from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
+
+def main():
+    db = init_db(f"index.new/{constants.DB_FILENAME}")
+    PageFrom = Page.alias()
+    PageTo = Page.alias()
+    link_query = (Link
+                  .select(Link, PageFrom, PageTo)
+                  .join(PageFrom, on=(Link.from_page_id == PageFrom.id))
+                  .join(PageTo, on=(Link.to_page_id == PageTo.id)))
+    for link in link_query.iterator():
+        from_resource = GeminiResource(link.from_page.fetchable_url)
+        to_resource = GeminiResource(link.to_page.fetchable_url)
+        is_cross_host_like = Link.get_is_cross_host_like(from_resource, to_resource)
+        link.is_cross_host_like = is_cross_host_like
+        link.save()
+        print("[{}] {} -> {}".format("T" if is_cross_host_like else "F", from_resource.fetchable_url, to_resource.fetchable_url))
+    print("\nDone!")
+
+
+if __name__ == "__main__":
+    main()