commit 34be029c6522ba5692722618af7efebe1420b355
parent a2607cd721bf2828c29b3b0dc65573f8bc1fb753
Author: Natalie Pendragon <natpen@natpen.net>
Date: Sun, 2 Aug 2020 05:42:59 -0400
[threads] Add collapsible log variations
Currently this does some work for both duplicated content (the last
two entries) and redirects (the first three entries). This is fine for
now, but the redirect magic could and should be made more robust by
actually resolving the redirect chain in the index when attempting to
build threads.
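
In practice the collapsing amounts to a prefix rewrite against a small hard-coded list of (canonical, variant) URL pairs. A minimal sketch of that idea, assuming a hypothetical helper named collapse_url (the actual code in build_threads.py inlines this logic at each call site):

    # Each tuple pairs a canonical prefix with a variant prefix that should be
    # collapsed onto it (duplicated content or a known redirect).
    collapsible_log_variations = [
        ("gemini://gemini.circumlunar.space/~", "gemini://gemini.circumlunar.space/users/"),
        ("gemini://cetacean.club", "gemini://maj.kahless.cetacean.club"),
    ]

    def collapse_url(url):
        # Rewrite a known variant prefix to its canonical prefix; leave other URLs alone.
        for canonical, variant in collapsible_log_variations:
            if url.startswith(variant):
                return canonical + url[len(variant):]
        return url

    # e.g. "gemini://gemini.circumlunar.space/users/natpen/" collapses to
    #      "gemini://gemini.circumlunar.space/~natpen/"
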
Diffstat:
5 files changed, 55 insertions(+), 11 deletions(-)
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -13,14 +13,14 @@ uses_netloc.append("gemini")
LOG_ROOT_LIKE_PATTERN = re.compile(".*/(gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/?$", flags=re.IGNORECASE)
LOG_POST_LIKE_PATTERN = re.compile(".*/((gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/.+$|.*/\d{4}[-_]\d{2}[-_]\d{2}.*$|.*/(19|20)\d{6}.*$)", flags=re.IGNORECASE)
-LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(".*/(games|archive|archives|rss|handlers)/.*|atom.xml$|gemlog.gmi$", flags=re.IGNORECASE)
+LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(".*/(games|archive|archives|rss|handlers|diagnostics)/.*|.*atom.xml$|.*gemlog.gmi$|.*index.gmi$|.*index.gemini$", flags=re.IGNORECASE)
LOG_POST_GEMLOGBLUE_LIKE_PATTERN = re.compile("^/users/[a-z][-a-z0-9]*/\d+\.gmi?", flags=re.IGNORECASE)
LOG_POST_BOSTON_LIKE_PATTERN = re.compile("^/boston/\d{4}/\d{2}/\d+\.\d+", flags=re.IGNORECASE)
ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE)
ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE)
-AUTHOR_URL_PATTERN = re.compile("^/~([a-z][-a-z0-9]/*)|^/users/~?([a-z][-a-z0-9]*)", flags=re.IGNORECASE)
+AUTHOR_URL_PATTERN = re.compile("^/~([a-z][-a-z0-9]*)/|^/users/~?([a-z][-a-z0-9]*)", flags=re.IGNORECASE)
AUTHOR_CONTENT_PATTERN = re.compile(".*(by|author): ([\w\s\d]+)", flags=re.IGNORECASE | re.MULTILINE)
TITLE_CONTENT_PATTERN = re.compile("^#\s(.*)$", flags=re.IGNORECASE | re.MULTILINE)
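
A quick note on the AUTHOR_URL_PATTERN change above: in the old pattern the `*` quantified the slash (`[-a-z0-9]/*`) rather than the character class, so it only ever captured the first two characters of a tilde username. A small illustrative check of the corrected pattern (not part of the commit; the example paths are hypothetical):

    import re

    AUTHOR_URL_PATTERN = re.compile("^/~([a-z][-a-z0-9]*)/|^/users/~?([a-z][-a-z0-9]*)", flags=re.IGNORECASE)

    # Tilde form: the corrected pattern captures the whole username...
    assert AUTHOR_URL_PATTERN.match("/~solderpunk/gemlog/3albums.gmi").group(1) == "solderpunk"
    # ...where the old pattern ("^/~([a-z][-a-z0-9]/*)|...") captured only "so".

    # /users/ form, with or without a tilde before the username:
    assert AUTHOR_URL_PATTERN.match("/users/natpen/posts/hello.gmi").group(2) == "natpen"
    assert AUTHOR_URL_PATTERN.match("/users/~natpen/posts/hello.gmi").group(2) == "natpen"
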
diff --git a/scripts/build_threads.py b/scripts/build_threads.py
@@ -4,6 +4,14 @@ from gus import constants
from gus.lib.db_model import init_db, Link, Page, Thread, ThreadPage
from gus.lib.gemini import GeminiResource
+collapsible_log_variations = [
+ ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/3albums/"),
+ ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/hitenheroes/"),
+ ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/cornedbeef/"),
+ ("gemini://gemini.circumlunar.space/~", "gemini://gemini.circumlunar.space/users/"),
+ ("gemini://cetacean.club", "gemini://maj.kahless.cetacean.club"),
+]
+
def find_thread_tops(resource, first_seen, page_id, content, current_chain=[]):
"""
@@ -11,6 +19,10 @@ def find_thread_tops(resource, first_seen, page_id, content, current_chain=[]):
page belongs to, then call recurse_thread on each of them to actually build
the full threads.
"""
+ for collapsible in collapsible_log_variations:
+ if resource.normalized_url.startswith(collapsible[1]):
+ resource = GeminiResource(collapsible[0] + resource.fetchable_url[len(collapsible[1]):])
+ break
u = resource.indexable_url.rstrip("/")
parent_pages_query = Page.raw("""SELECT p_to.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen
FROM page AS p_from
@@ -30,6 +42,10 @@ ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_
found_threadable_parents = False
for parent_page in parent_pages_query.iterator():
parent_resource = GeminiResource(parent_page.fetchable_url)
+ for collapsible in collapsible_log_variations:
+ if resource.normalized_url.startswith(collapsible[1]):
+ parent_resource = GeminiResource(collapsible[0] + resource.fetchable_url[len(collapsible[1]):])
+ break
# Skip any parents that are already in the list of seen resources for this call
# stack - it means they're circular linking
if any(r for r in current_chain if r.normalized_url == resource.normalized_url):
@@ -45,7 +61,7 @@ ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_
if not found_threadable_parents:
# return early if thread top already processed
try:
- query = ThreadPage.select().join(Page).where(Page.url == resource.indexable_url, ThreadPage.address == "1")
+ query = ThreadPage.select().join(Page).where(Page.url == resource.indexable_url, ThreadPage.address == "001")
query.get()
print(f"\nAlready done: {resource.fetchable_url}")
return
@@ -74,7 +90,7 @@ ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_
friendly_author=m[0].get_friendly_author(m[4]),
friendly_title=m[0].get_friendly_title(m[4]),
)
- print(" -> [{:<15}] [{}] {}".format(m[1], m[2], m[0].fetchable_url))
+ print(" -> [{:<19}] [{}] {}".format(m[1], m[2], m[0].fetchable_url))
def recurse_thread(resource, path, first_seen, page_id, content, current_chain=[]):
@@ -82,6 +98,19 @@ def recurse_thread(resource, path, first_seen, page_id, content, current_chain=[
# if not resource.is_valid:
return []
u = resource.indexable_url.rstrip("/")
+ from_urls = [
+ u,
+ f"{u}/",
+ ]
+ for collapsible in collapsible_log_variations:
+ if resource.normalized_url.startswith(collapsible[1]):
+ new_u = collapsible[0] + resource.indexable_url[len(collapsible[1]):]
+ from_urls.extend([new_u, f"{new_u}/"])
+ break
+ elif resource.normalized_url.startswith(collapsible[0]):
+ new_u = collapsible[1] + resource.indexable_url[len(collapsible[0]):]
+ from_urls.extend([new_u, f"{new_u}/"])
+ break
children_query = Page.raw("""SELECT p_from.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen
FROM page AS p_from
JOIN indexable_crawl AS ic
@@ -92,11 +121,11 @@ JOIN link as l
ON l.from_page_id == p_from.id
JOIN page as p_to
ON p_to.id == l.to_page_id
-WHERE p_to.url IN (?, ?)
+WHERE p_to.url IN (""" + ", ".join(["?" for x in range(len(from_urls))]) + """)
AND p_from.normalized_url != ?
AND c.status == 20
GROUP BY p_from.normalized_url
-ORDER BY l.is_cross_host_like, first_seen ASC""", u, f"{u}/", resource.normalized_url)
+ORDER BY l.is_cross_host_like, first_seen ASC""", *from_urls, resource.normalized_url)
threadable_child_index = 1
new_thread_members = [(
resource,
@@ -105,8 +134,17 @@ ORDER BY l.is_cross_host_like, first_seen ASC""", u, f"{u}/", resource.normalize
page_id,
content,
)]
+ processed_collapsed_urls = []
for child in children_query.iterator():
- child_resource = GeminiResource(child.fetchable_url)
+ collapsed_url = child.fetchable_url
+ for collapsible in collapsible_log_variations:
+ if child.normalized_url.startswith(collapsible[1]):
+ collapsed_url = collapsible[0] + child.fetchable_url[len(collapsible[1]):]
+ break
+ if collapsed_url in processed_collapsed_urls:
+ continue
+ processed_collapsed_urls.append(collapsed_url)
+ child_resource = GeminiResource(collapsed_url)
if is_threadable_link(child_resource, resource, child.is_cross_host_like):
# Skip any parents that are already in the list of seen resources for this call
# stack - it means they're circular linking
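
One detail worth spelling out from the recurse_thread hunk above: once a post can be reached under several collapsed and uncollapsed URLs, the IN clause can no longer be a fixed pair of placeholders, so the query string grows one "?" per entry in from_urls and the list is splatted into the raw query. A standalone illustration of that placeholder construction (the URLs here are made up):

    from_urls = [
        "gemini://example.org/~alice/gemlog/reply.gmi",
        "gemini://example.org/~alice/gemlog/reply.gmi/",
        "gemini://example.org/users/alice/gemlog/reply.gmi",
        "gemini://example.org/users/alice/gemlog/reply.gmi/",
    ]

    # One "?" per candidate URL, so the parameter count always matches len(from_urls).
    placeholders = ", ".join("?" for _ in from_urls)
    sql = "... WHERE p_to.url IN (" + placeholders + ") AND p_from.normalized_url != ? ..."

    # Executed via peewee's raw query interface, along the lines of:
    #   Page.raw(sql, *from_urls, resource.normalized_url)
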
diff --git a/serve/models.py b/serve/models.py
@@ -100,7 +100,7 @@ FROM (
SELECT *
FROM thread
ORDER BY updated_at DESC
- LIMIT 20) AS t
+ LIMIT 50) AS t
JOIN threadpage AS tp
ON tp.thread_id == t.id
JOIN page AS p
diff --git a/serve/templates/about.gmi b/serve/templates/about.gmi
@@ -62,10 +62,16 @@ The URL structure for retrieving a certain URL's backlinks page is predictable,
=> gemini://gus.guru/backlinks?gus.guru
-Note the distinction between "internal" and "cross-capsule" backlinks. Internal backlinks are backlinks from within your own capsule to the given page. Cross-capsule backlinks are backlinks from other users' capsules.
+Note the distinction between "internal" and "cross-capsule" backlinks. Internal backlinks are backlinks from within your own capsule to the given page. Cross-capsule backlinks are backlinks from other users' capsules. The cross-capsule determination is slightly more advanced than simply checking whether the hosts differ - it also takes into account different users on pubnixes, so, for example, gemini://foo.bar, gemini://foo.bar/~ronald, and gemini://foo.bar/~mcdonald would all be considered distinct capsules, as they are all presumably authored and maintained by distinct humans.
{% include 'fragments/footer.gmi' %}
### Threads (coming soon!)
-Oftentimes in Geminispace a post on someone's gemlog will generate a reply on someone else's gemlog. Sometimes many replies! Sometimes the replies generate their own replies! GUS Threads allow you to visualize and explore these threads within Geminispace.
+Oftentimes in Geminispace a post on someone's gemlog will generate a reply on someone else's gemlog. Sometimes many replies! Sometimes the replies generate their own replies! GUS Threads allow you to visualize and explore these threads within Geminispace. You can peruse threads freely, but you can also participate in them without needing any extra software on your end. Inside your reply post, simply link to the post you're replying to (which, frankly, most authors are already doing anyway!) and GUS will sort out the rest.
+
+For those interested in more technical detail, what follows is a deeper description of how this functionality works, and how GUS determines which pages are eligible to participate in threads. The first important point is that some pages are indeed _not_ eligible to participate in threads - the point of threads is to capture connected discussion between human authors, so there are rules to determine which pages seem likely to be gemlog posts. A lot of this logic is based on URL structure, and if you nest your post pages within a `gemlog`, `glog`, `log`, `posts`, or one of several other similar URL components, GUS will opt the nested pages into threads. It works well in general cases, and I've also added special rules for atypical capsules as I've come across them (still feasible at the current size of Geminispace :), such as Gemlog Blue and The Boston Diaries.
+
+The next important piece of GUS Threads functionality is that, for all the eligible post pages in Geminispace at a given time, threads are constructed out of cross-capsule links between those pages (see the backlinks documentation above for more on the cross-capsule distinction).
+
+One nuanced technical limitation is that a given page can only appear in a thread once, and GUS keeps the occurrence that falls _latest_ in the thread.
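
To make the URL-structure rules above a bit more concrete, here is a rough check against two of the patterns from gus/lib/gemini.py (an illustration only; the helper and example paths below are hypothetical, and the real eligibility logic in GUS involves more than these two regexes):

    import re

    # Patterns as defined in gus/lib/gemini.py (post-commit versions).
    LOG_POST_LIKE_PATTERN = re.compile(
        ".*/((gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/.+$"
        "|.*/\d{4}[-_]\d{2}[-_]\d{2}.*$|.*/(19|20)\d{6}.*$)",
        flags=re.IGNORECASE,
    )
    LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(
        ".*/(games|archive|archives|rss|handlers|diagnostics)/.*"
        "|.*atom.xml$|.*gemlog.gmi$|.*index.gmi$|.*index.gemini$",
        flags=re.IGNORECASE,
    )

    def looks_like_log_post(path):
        # Hypothetical helper: post-like URL shape, minus the exclusions.
        return bool(LOG_POST_LIKE_PATTERN.match(path)) and not LOG_POST_LIKE_EXCLUSION_PATTERN.match(path)

    assert looks_like_log_post("/~alice/gemlog/first-post.gmi")      # nested under a gemlog/ directory
    assert not looks_like_log_post("/~alice/gemlog/index.gmi")       # index pages are excluded
    assert not looks_like_log_post("/~alice/pictures/cat.gmi")       # not a post-like URL shape
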
diff --git a/serve/templates/threads.gmi b/serve/templates/threads.gmi
@@ -10,7 +10,7 @@
{% for member in thread["members"] %}
=> {{ member["fetchable_url"] }} [{{ member["first_seen"] | datetimeformat("%b %d") }}] {{member["address"] | threadaddressformat }} {{ member["friendly_author"] }} - {{ member["friendly_title"] }}
{% endfor %}
-~~~~~~~~~~
+
{% endfor %}
{% endfor %}