commit f75751e5b939a8b828206cfe9e3eaf0c3f73edcb
parent 8c1399ade960df51af3c53f6bcd8ee7fad0df56e
Author: Natalie Pendragon <natpen@natpen.net>
Date: Tue, 28 Jul 2020 07:02:50 -0400
Add friendly authors and titles for threads
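
Thread pages now carry a human-readable author and title, shown on the
threads page instead of the raw URL. Both fields are resolved with a
cascade of heuristics: try the URL first, then the page content, then
fall back to the normalized host (for authors) or the URL path (for
titles).

A rough sketch of the intended behavior; the URL and content below are
made up for illustration:

    from gus.lib.gemini import GeminiResource

    resource = GeminiResource("gemini://example.org/users/alice/2020-07-27-hello-gemini.gmi")
    content = "# Hello Gemini\n..."
    resource.get_friendly_author(content)  # "alice", from the /users/ URL
    resource.get_friendly_title(content)   # "Hello Gemini", from the first heading
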
Diffstat:
5 files changed, 100 insertions(+), 11 deletions(-)
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -94,3 +94,5 @@ class ThreadPage(Model):
    thread = ForeignKeyField(Thread, backref="pages", on_delete="CASCADE")
    page = ForeignKeyField(Page, backref="threads", on_delete='CASCADE')
    address = TextField()
+    friendly_author = TextField()
+    friendly_title = TextField()
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -13,11 +13,19 @@ uses_netloc.append("gemini")
LOG_ROOT_LIKE_PATTERN = re.compile(".*/(gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/?$", flags=re.IGNORECASE)
LOG_POST_LIKE_PATTERN = re.compile(".*/((gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/.+$|.*/\d{4}[-_]\d{2}[-_]\d{2}.*$|.*/(19|20)\d{6}.*$)", flags=re.IGNORECASE)
-LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(".*/(games|archive|archives|rss)/.*", flags=re.IGNORECASE)
+LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(".*/(games|archive|archives|rss|handlers)/.*|atom.xml$|gemlog.gmi$", flags=re.IGNORECASE)
LOG_POST_GEMLOGBLUE_LIKE_PATTERN = re.compile("^/users/[a-z][-a-z0-9]*/\d+\.gmi?", flags=re.IGNORECASE)
+LOG_POST_BOSTON_LIKE_PATTERN = re.compile("^/boston/\d{4}/\d{2}/\d+\.\d+", flags=re.IGNORECASE)
+
ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE)
ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE)
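+# AUTHOR_URL_PATTERN: group 1 captures the author from /~user paths,
+# group 2 from /users/ paths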
+AUTHOR_URL_PATTERN = re.compile("^/~([a-z][-a-z0-9]*)|^/users/~?([a-z][-a-z0-9]*)", flags=re.IGNORECASE)
+AUTHOR_CONTENT_PATTERN = re.compile(".*(by|author): ([\w\s\d]+)", flags=re.IGNORECASE | re.MULTILINE)
+
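+# TITLE_URL_PATTERN: group 2 captures the slug; group 1 an optional leading
+# date, group 3 the file extension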
+TITLE_CONTENT_PATTERN = re.compile("^#\s(.*)$", flags=re.IGNORECASE | re.MULTILINE)
+TITLE_URL_PATTERN = re.compile(".*/(\d{8}[-_]|\d{4}[-_]\d{2}[-_]\d{2}[-_])?([a-z0-9-_]+)(\.[a-z0-9]+)$", flags=re.IGNORECASE)
+
class GeminiRobotFileParser(RobotFileParser):
    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
@@ -185,12 +193,63 @@ class GeminiResource():
        post_like_match = LOG_POST_LIKE_PATTERN.match(self.urlsplit.path)
        post_like_exclusion_match = LOG_POST_LIKE_EXCLUSION_PATTERN.match(self.urlsplit.path)
        post_gemlogblue_match = LOG_POST_GEMLOGBLUE_LIKE_PATTERN.match(self.urlsplit.path)
+        post_boston_match = LOG_POST_BOSTON_LIKE_PATTERN.match(self.urlsplit.path)
-        if (post_like_match and not post_like_exclusion_match) or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match):
+        if (post_like_match and not post_like_exclusion_match) or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match) or (self.normalized_host == "gemini.conman.org" and post_boston_match):
            is_log_post_like = True
        self._is_log_post_like = is_log_post_like
        return self._is_log_post_like
+    def get_friendly_author(self, content):
+        if not self.is_valid:
+            return None
+        friendly_author = None
+        # first check the URL
+        author_url_match = AUTHOR_URL_PATTERN.match(self.urlsplit.path)
+        if author_url_match:
+            if author_url_match[1]:
+                friendly_author = author_url_match[1]
+            elif author_url_match[2]:
+                friendly_author = author_url_match[2]
+        if friendly_author is None:
+            # if no URL match, try looking in page content
+            if isinstance(content, str):
+                # search (not match): a byline may appear on any line
+                author_content_match = AUTHOR_CONTENT_PATTERN.search(content)
+                if author_content_match:
+                    # group 2 is the name; group 1 the "by"/"author" keyword
+                    friendly_author = author_content_match[2]
+        if friendly_author is None:
+            # if still no match, use normalized host
+            friendly_author = self.normalized_host
+        return friendly_author
+
+
+    def get_friendly_title(self, content):
+        if not self.is_valid:
+            return None
+        friendly_title = None
+
+        if isinstance(content, str):
+            # first try page content; search (not match): the first
+            # heading need not be on the first line
+            title_content_match = TITLE_CONTENT_PATTERN.search(content)
+            if title_content_match:
+                friendly_title = title_content_match[1]
+        if friendly_title is None:
+            # if no content match, try looking in URL
+            title_url_match = TITLE_URL_PATTERN.match(self.urlsplit.path)
+            if title_url_match:
+                friendly_title = title_url_match[2].replace("-", " ").replace("_", " ").strip().title()
+        if friendly_title is None:
+            # if still no match, use URL path
+            friendly_title = self.urlsplit.path.lstrip("/")
+        return friendly_title
+
+
+    def get_log_post_date(self, content):
+        # stub: post-date extraction is not yet implemented; always returns None
+        if not self.is_log_post_like:
+            return None
+
+
+
    # constructed from fetchable_url
    # does not matter if quoted or unquoted so I choose arbitrarily to
    # standardize on unquoting it.
diff --git a/scripts/build_threads.py b/scripts/build_threads.py
@@ -5,7 +5,7 @@ from gus.lib.db_model import init_db, Link, Page, Thread, ThreadPage
from gus.lib.gemini import GeminiResource
-def find_thread_tops(resource, first_seen, page_id, current_chain=[]):
+def find_thread_tops(resource, first_seen, page_id, content, current_chain=[]):
"""
This function will recursively walk up to the tops of all threads a given
page belongs to, then call recurse_thread on each of them to actually build
@@ -36,7 +36,12 @@ ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_
            continue
        if is_threadable_link(resource, parent_resource, parent_page.is_cross_host_like):
            found_threadable_parents = True
-            find_thread_tops(parent_resource, parent_page.first_seen, parent_page.id, current_chain + [resource])
+            find_thread_tops(
+                parent_resource,
+                parent_page.first_seen,
+                parent_page.id,
+                parent_page.content,
+                current_chain + [resource])
    if not found_threadable_parents:
        # return early if thread top already processed
        try:
@@ -46,7 +51,7 @@ ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_
            return
        except ThreadPage.DoesNotExist:
            pass
-        full_thread = recurse_thread(resource, "1", first_seen, page_id)
+        full_thread = recurse_thread(resource, "1", first_seen, page_id, content)
        # Deduplicate
        full_thread.reverse()
@@ -62,11 +67,17 @@ ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_
        thread = Thread.create(updated_at=thread_updated_at)
        print()
        for m in full_thread:
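+            # m is (resource, thread address, first_seen, page_id, content)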
-            ThreadPage.create(thread=thread, page_id=m[3], address=m[1])
+            ThreadPage.create(
+                thread=thread,
+                page_id=m[3],
+                address=m[1],
+                friendly_author=m[0].get_friendly_author(m[4]),
+                friendly_title=m[0].get_friendly_title(m[4]),
+            )
            print(" -> [{:<11}] [{}] {}".format(m[1], m[2], m[0].fetchable_url))
-def recurse_thread(resource, path, first_seen, page_id, current_chain=[]):
+def recurse_thread(resource, path, first_seen, page_id, content, current_chain=[]):
    if not resource.is_valid or not resource.is_log_post_like:
        # if not resource.is_valid:
        return []
@@ -87,7 +98,13 @@ AND c.status == 20
GROUP BY p_from.normalized_url
ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalized_url)
    threadable_child_index = 1
-    new_thread_members = [(resource, path, first_seen, page_id)]
+    new_thread_members = [(
+        resource,
+        path,
+        first_seen,
+        page_id,
+        content,
+    )]
    for child in children_query.iterator():
        child_resource = GeminiResource(child.fetchable_url)
        if is_threadable_link(child_resource, resource, child.is_cross_host_like):
@@ -96,7 +113,14 @@ ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalize
            if any(r for r in current_chain if r.normalized_url == resource.normalized_url):
                continue
            child_path = f"{path}.{threadable_child_index}"
-            new_thread_members.extend(recurse_thread(child_resource, child_path, child.first_seen, child.id, current_chain + [resource]))
+            new_thread_members.extend(recurse_thread(
+                child_resource,
+                child_path,
+                child.first_seen,
+                child.id,
+                child.content,
+                current_chain + [resource]
+            ))
            threadable_child_index += 1
    return new_thread_members
@@ -124,7 +148,7 @@ GROUP BY p.normalized_url
    for page in pages_query.iterator():
        resource = GeminiResource(page.fetchable_url)
        if resource.is_valid and resource.is_log_post_like:
-            find_thread_tops(resource, page.first_seen, page.id)
+            find_thread_tops(resource, page.first_seen, page.id, page.content)
    print("\nDone!")
diff --git a/serve/models.py b/serve/models.py
@@ -91,6 +91,8 @@ ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalize
    def get_threads(self):
        threads_query = Thread.raw("""SELECT t.*
, tp.address
+ , tp.friendly_author
+ , tp.friendly_title
, p.fetchable_url
, p.url
, MIN(c.timestamp) AS first_seen
@@ -128,6 +130,8 @@ ORDER BY t.updated_at DESC, t.id ASC, tp.address ASC""")
"url": thread_member.url,
"fetchable_url": thread_member.fetchable_url,
"address": thread_member.address,
+ "friendly_author": thread_member.friendly_author,
+ "friendly_title": thread_member.friendly_title,
"first_seen": datetime.strptime(thread_member.first_seen, "%Y-%m-%d %H:%M:%S.%f"),
})
        # return sorted(threads, key=lambda x: (x["updated_at"], ), reverse=True)
diff --git a/serve/templates/threads.gmi b/serve/templates/threads.gmi
@@ -8,7 +8,7 @@
{% for thread in date["threads"] %}
{% for member in thread["members"] %}
-=> {{ member["fetchable_url"] }} [{{ member["first_seen"] | datetimeformat("%b %d") }}] {{member["address"] | threadaddressformat }} {{ member["url"][9:] }}
+=> {{ member["fetchable_url"] }} [{{ member["first_seen"] | datetimeformat("%b %d") }}] {{member["address"] | threadaddressformat }} {{ member["friendly_author"] }} - {{ member["friendly_title"] }}
{% endfor %}
~~~~~~~~~~
{% endfor %}