commit c341bb82ae237de0fa3aff8b5bec1b3d5efa791c
parent 6c187c2af2168e097c45589482e36e723b541e03
Author: Natalie Pendragon <natpen@natpen.net>
Date: Thu, 23 Jul 2020 09:44:49 -0400
[crawl] Add is_cross_host_like field to db
Diffstat:
4 files changed, 47 insertions(+), 2 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -277,6 +277,7 @@ def index_links(from_resource, contained_resources):
data.append({
"from_page": from_page,
"to_page": to_page,
+ "is_cross_host_like": Link.get_is_cross_host_like(from_resource, cr),
})
Link.insert_many(data).execute()
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -47,6 +47,11 @@ class Link(Model):
from_page = ForeignKeyField(Page, backref="outbound_links", on_delete='CASCADE')
to_page = ForeignKeyField(Page, backref="backlinks", on_delete='CASCADE')
+ is_cross_host_like = BooleanField()
+
+    @staticmethod
+    def get_is_cross_host_like(from_resource, to_resource):
+        """Return True when a link crosses between two "host-like" locations.
+
+        Declared as a static method because it takes no ``self``: without the
+        decorator, calling it on a ``Link`` instance would bind the instance
+        as ``from_resource``. Calls through the class are unaffected.
+
+        Args:
+            from_resource: object exposing ``normalized_host_like`` for the
+                page the link originates from.
+            to_resource: object exposing ``normalized_host_like`` for the
+                page the link points to.
+
+        Returns:
+            bool: True if the two ``normalized_host_like`` values differ.
+        """
+        return from_resource.normalized_host_like != to_resource.normalized_host_like
+
class Crawl(Model):
"""
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -12,7 +12,8 @@ uses_relative.append("gemini")
uses_netloc.append("gemini")
LOG_LIKE_PATTERN = re.compile(".*/(gemlog|glog|starlog|pikkulog)/?$")
-ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$")
+ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$")
+# Prefix form of the root-like pattern. The prefix must end at a path-segment
+# boundary ("/" or end of path) so that e.g. "/usersfoo/x" is not treated as
+# living under "/users" and "/~bob.gmi" is not treated as living under "/~bob".
+ROOT_LIKE_PATTERN = re.compile(r"^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)(?:/|$)")
class GeminiRobotFileParser(RobotFileParser):
def set_url(self, url):
@@ -47,6 +48,7 @@ class GeminiResource():
self.fully_qualified_parent_url = fully_qualified_parent_url
self._normalized_url = None
self._normalized_host = None
+ self._normalized_host_like = None
self._fetchable_url = None
self._indexable_url = None
self._is_root_like = None
@@ -107,6 +109,18 @@ class GeminiResource():
return self._normalized_host
+    def _get_normalized_host_like(self):
+        """Return the normalized host extended by any root-like path prefix.
+
+        Returns None when the resource is not valid. Otherwise the value is
+        ``self.normalized_host`` followed by the part of the URL path matched
+        by ``ROOT_LIKE_PATTERN`` (trailing slashes stripped), or just the
+        normalized host when the path does not match. The result is cached
+        in ``self._normalized_host_like``.
+        """
+        if not self.is_valid:
+            return None
+        if self._normalized_host_like is not None:
+            # Already computed on an earlier access.
+            return self._normalized_host_like
+        host_like = self.normalized_host
+        prefix_match = ROOT_LIKE_PATTERN.match(self.urlsplit.path)
+        if prefix_match is not None:
+            host_like += prefix_match.group(0).rstrip("/")
+        self._normalized_host_like = host_like
+        return self._normalized_host_like
+
+
def _get_fetchable_url(self):
if not self.is_valid:
return None
@@ -146,7 +160,7 @@ class GeminiResource():
def _get_is_root_like(self):
if self._is_root_like is None:
is_root_like = False
- if self.urlsplit.path == "" or self.urlsplit.path == "/" or ROOT_LIKE_PATTERN.match(self.urlsplit.path):
+ if self.urlsplit.path == "" or self.urlsplit.path == "/" or ROOT_LIKE_ONLY_PATTERN.match(self.urlsplit.path):
is_root_like = True
self._is_root_like = is_root_like
return self._is_root_like
@@ -174,6 +188,7 @@ class GeminiResource():
indexable_url = property(_get_indexable_url)
is_root_like = property(_get_is_root_like)
is_log_like = property(_get_is_log_like)
+ normalized_host_like = property(_get_normalized_host_like)
def fetch(self):
# NB: this intentionally does NOT fetch the normalized URL, because that could
diff --git a/scripts/add_is_cross_host_like.py b/scripts/add_is_cross_host_like.py
@@ -0,0 +1,24 @@
+from gus import constants
+from gus.lib.db_model import init_db, Link, Page
+from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
+
+def main():
+    """Backfill ``is_cross_host_like`` on every stored link.
+
+    Opens the database under ``index.new/``, walks each Link joined to its
+    source and target Page, recomputes the flag from the two pages'
+    ``fetchable_url`` values, saves the row, and prints one line per link.
+    """
+    db = init_db(f"index.new/{constants.DB_FILENAME}")
+    source_page = Page.alias()
+    target_page = Page.alias()
+    # Page is joined twice, so each side of the link needs its own alias.
+    links = Link.select(Link, source_page, target_page)
+    links = links.join(source_page, on=(Link.from_page_id == source_page.id))
+    links = links.join(target_page, on=(Link.to_page_id == target_page.id))
+    for row in links.iterator():
+        origin = GeminiResource(row.from_page.fetchable_url)
+        destination = GeminiResource(row.to_page.fetchable_url)
+        crosses = Link.get_is_cross_host_like(origin, destination)
+        row.is_cross_host_like = crosses
+        row.save()
+        marker = "T" if crosses else "F"
+        print("[{}] {} -> {}".format(marker, origin.fetchable_url, destination.fetchable_url))
+    print("\nDone!")
+
+
+if __name__ == "__main__":
+ main()