commit 8c1399ade960df51af3c53f6bcd8ee7fad0df56e
parent ded0c0ca62fe84b119a4325a53331408328e389d
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 27 Jul 2020 14:50:15 -0400
Threads v1
Diffstat:
8 files changed, 271 insertions(+), 19 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -228,7 +228,7 @@ def index_prompt(resource, response):
def index_content(resource, response):
print("INDEXING CONTENT...")
- change_frequency = constants.DEFAULT_ROOT_CHANGE_FREQUENCY if resource.is_root_like or resource.is_log_like else constants.DEFAULT_NON_ROOT_CHANGE_FREQUENCY
+ change_frequency = constants.DEFAULT_ROOT_CHANGE_FREQUENCY if resource.is_root_like or resource.is_log_root_like else constants.DEFAULT_NON_ROOT_CHANGE_FREQUENCY
doc = {
"url": resource.indexable_url,
"fetchable_url": resource.fetchable_url,
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -10,11 +10,13 @@ from peewee import (
TextField,
)
+from gus.lib.gemini import GeminiResource
+
def init_db(filename=":memory:"):
"""
Bind an SQLite database to the Peewee ORM models.
"""
- models = [Crawl, Link, Page, Search]
+ models = [Crawl, Link, Page, Search, Thread, ThreadPage]
db = SqliteDatabase(filename)
db.bind(models)
db.create_tables(models)
@@ -77,3 +79,18 @@ class Search(Model):
query = TextField()
timestamp = DateTimeField()
+
+class Thread(Model):
+ """
+ Thread definitions.
+ """
+ updated_at = DateTimeField()
+
+class ThreadPage(Model):
+ """
+ Mapping table of threads to their member pages.
+ """
+
+ thread = ForeignKeyField(Thread, backref="pages", on_delete="CASCADE")
+    page = ForeignKeyField(Page, backref="threads", on_delete="CASCADE")
+ address = TextField()
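[Annotation: the address column encodes each page's position within its thread as a dotted path, assigned by scripts/build_threads.py below. A minimal sketch with hypothetical URLs; note that plain string sort gives depth-first thread order, at least while no page has ten or more direct replies.]

```
# (address, url) pairs for one hypothetical thread: "1" is the thread top,
# "1.1" its first reply, "1.1.1" a reply to that reply, and so on.
members = [
    ("1",     "gemini://a.example/gemlog/post.gmi"),
    ("1.1",   "gemini://b.example/glog/re-post.gmi"),
    ("1.1.1", "gemini://a.example/gemlog/re-re-post.gmi"),
    ("1.2",   "gemini://c.example/phlog/another-reply.gmi"),
]
# Lexicographic order on the address matches depth-first reply order.
assert sorted(members) == members
```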
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -11,9 +11,12 @@ from gus.lib.domain import is_domain
uses_relative.append("gemini")
uses_netloc.append("gemini")
-LOG_LIKE_PATTERN = re.compile(".*/(gemlog|glog|starlog|pikkulog)/?$")
-ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$")
-ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?")
+LOG_ROOT_LIKE_PATTERN = re.compile(".*/(gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/?$", flags=re.IGNORECASE)
+LOG_POST_LIKE_PATTERN = re.compile(r".*/((gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/.+$|.*/\d{4}[-_]\d{2}[-_]\d{2}.*$|.*/(19|20)\d{6}.*$)", flags=re.IGNORECASE)
+LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(".*/(games|archive|archives|rss)/.*", flags=re.IGNORECASE)
+LOG_POST_GEMLOGBLUE_LIKE_PATTERN = re.compile(r"^/users/[a-z][-a-z0-9]*/\d+\.gmi?", flags=re.IGNORECASE)
+ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE)
+ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE)
class GeminiRobotFileParser(RobotFileParser):
def set_url(self, url):
@@ -52,7 +55,8 @@ class GeminiResource():
self._fetchable_url = None
self._indexable_url = None
self._is_root_like = None
- self._is_log_like = None
+ self._is_log_root_like = None
+ self._is_log_post_like = None
self.contained_resources = None
def urlsplit_featureful(url, fully_qualified_parent_url=None, parent_hostname=None):
@@ -166,13 +170,25 @@ class GeminiResource():
return self._is_root_like
- def _get_is_log_like(self):
- if self._is_log_like is None:
- is_log_like = False
- if self.urlsplit.path == "" or self.urlsplit.path == "/" or LOG_LIKE_PATTERN.match(self.urlsplit.path):
- is_log_like = True
- self._is_log_like = is_log_like
- return self._is_log_like
+ def _get_is_log_root_like(self):
+ if self._is_log_root_like is None:
+ is_log_root_like = False
+ if self.urlsplit.path == "" or self.urlsplit.path == "/" or LOG_ROOT_LIKE_PATTERN.match(self.urlsplit.path):
+ is_log_root_like = True
+ self._is_log_root_like = is_log_root_like
+ return self._is_log_root_like
+
+
+ def _get_is_log_post_like(self):
+ if self._is_log_post_like is None:
+ is_log_post_like = False
+ post_like_match = LOG_POST_LIKE_PATTERN.match(self.urlsplit.path)
+ post_like_exclusion_match = LOG_POST_LIKE_EXCLUSION_PATTERN.match(self.urlsplit.path)
+ post_gemlogblue_match = LOG_POST_GEMLOGBLUE_LIKE_PATTERN.match(self.urlsplit.path)
+ if (post_like_match and not post_like_exclusion_match) or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match):
+ is_log_post_like = True
+ self._is_log_post_like = is_log_post_like
+ return self._is_log_post_like
# constructed from fetchable_url
@@ -187,7 +203,8 @@ class GeminiResource():
# should be unquoted.
indexable_url = property(_get_indexable_url)
is_root_like = property(_get_is_root_like)
- is_log_like = property(_get_is_log_like)
+ is_log_root_like = property(_get_is_log_root_like)
+ is_log_post_like = property(_get_is_log_post_like)
normalized_host_like = property(_get_normalized_host_like)
def fetch(self):
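[Annotation: a quick sanity check of the new path classifiers above. The example paths are made up, not from the commit.]

```
from gus.lib.gemini import (
    LOG_ROOT_LIKE_PATTERN,
    LOG_POST_LIKE_PATTERN,
    LOG_POST_LIKE_EXCLUSION_PATTERN,
)

# A gemlog index page is "log root like"...
assert LOG_ROOT_LIKE_PATTERN.match("/~alice/gemlog/")
# ...while dated pages beneath it are "log post like".
assert LOG_POST_LIKE_PATTERN.match("/~alice/gemlog/2020-07-27-threads.gmi")
assert LOG_POST_LIKE_PATTERN.match("/posts/20200727.gmi")
assert not LOG_POST_LIKE_PATTERN.match("/~alice/gemlog/")
# Archive listings match the exclusion pattern and are filtered out.
assert LOG_POST_LIKE_EXCLUSION_PATTERN.match("/archive/2020-07-27.gmi")
```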
diff --git a/scripts/build_threads.py b/scripts/build_threads.py
@@ -0,0 +1,132 @@
+from peewee import JOIN
+
+from gus import constants
+from gus.lib.db_model import init_db, Link, Page, Thread, ThreadPage
+from gus.lib.gemini import GeminiResource
+
+
+def find_thread_tops(resource, first_seen, page_id, current_chain=[]):
+ """
+ This function will recursively walk up to the tops of all threads a given
+ page belongs to, then call recurse_thread on each of them to actually build
+ the full threads.
+ """
+ u = resource.indexable_url.rstrip("/")
+    parent_pages_query = Page.raw("""SELECT p_to.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen
+FROM page AS p_from
+JOIN link AS l
+ON l.from_page_id == p_from.id
+JOIN page AS p_to
+ON p_to.id == l.to_page_id
+JOIN indexable_crawl AS ic
+ON ic.page_id == p_to.id
+JOIN crawl AS c
+ON c.page_id == p_to.id
+WHERE p_from.url IN (?, ?)
+AND p_to.normalized_url != ?
+AND c.status == 20
+GROUP BY p_to.normalized_url
+ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_url)
+ found_threadable_parents = False
+ for parent_page in parent_pages_query.iterator():
+ parent_resource = GeminiResource(parent_page.fetchable_url)
+        # Skip any parent that is already in the chain of resources seen in
+        # this call stack - that indicates circular linking
+        if any(r.normalized_url == parent_resource.normalized_url for r in current_chain):
+ continue
+ if is_threadable_link(resource, parent_resource, parent_page.is_cross_host_like):
+ found_threadable_parents = True
+ find_thread_tops(parent_resource, parent_page.first_seen, parent_page.id, current_chain + [resource])
+ if not found_threadable_parents:
+ # return early if thread top already processed
+ try:
+ query = ThreadPage.select().join(Page).where(Page.url == resource.indexable_url, ThreadPage.address == "1")
+ query.get()
+ print(f"\nAlready done: {resource.fetchable_url}")
+ return
+ except ThreadPage.DoesNotExist:
+ pass
+ full_thread = recurse_thread(resource, "1", first_seen, page_id)
+
+    # Deduplicate, keeping only each page's first (shallowest) occurrence
+ full_thread.reverse()
+ i = 0
+ while i < len(full_thread):
+ if any(x for x in full_thread[i+1:] if x[0].normalized_url == full_thread[i][0].normalized_url):
+ full_thread.pop(i)
+ else:
+ i += 1
+ full_thread.reverse()
+
+ thread_updated_at = max(m[2] for m in full_thread)
+ thread = Thread.create(updated_at=thread_updated_at)
+ print()
+ for m in full_thread:
+ ThreadPage.create(thread=thread, page_id=m[3], address=m[1])
+ print(" -> [{:<11}] [{}] {}".format(m[1], m[2], m[0].fetchable_url))
+
+
+def recurse_thread(resource, path, first_seen, page_id, current_chain=[]):
+ if not resource.is_valid or not resource.is_log_post_like:
+ return []
+ u = resource.indexable_url.rstrip("/")
+ children_query = Page.raw("""SELECT p_from.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen
+FROM page AS p_from
+JOIN indexable_crawl AS ic
+ON ic.page_id == p_from.id
+JOIN crawl AS c
+ON c.page_id == p_from.id
+JOIN link AS l
+ON l.from_page_id == p_from.id
+JOIN page AS p_to
+ON p_to.id == l.to_page_id
+WHERE p_to.url IN (?, ?)
+AND p_from.normalized_url != ?
+AND c.status == 20
+GROUP BY p_from.normalized_url
+ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalized_url)
+ threadable_child_index = 1
+ new_thread_members = [(resource, path, first_seen, page_id)]
+ for child in children_query.iterator():
+ child_resource = GeminiResource(child.fetchable_url)
+ if is_threadable_link(child_resource, resource, child.is_cross_host_like):
+            # Skip any child that is already in the chain of resources seen in
+            # this call stack - that indicates circular linking
+            if any(r.normalized_url == child_resource.normalized_url for r in current_chain):
+ continue
+ child_path = f"{path}.{threadable_child_index}"
+ new_thread_members.extend(recurse_thread(child_resource, child_path, child.first_seen, child.id, current_chain + [resource]))
+ threadable_child_index += 1
+ return new_thread_members
+
+
+def is_threadable_link(r1, r2, is_cross_host_like):
+ return r1.is_log_post_like and r2.is_log_post_like and is_cross_host_like
+
+
+def main():
+ db = init_db(f"index/{constants.DB_FILENAME}")
+ Thread.delete().execute()
+ ThreadPage.delete().execute()
+ pages_query = Page.raw("""SELECT p.*, MIN(c.timestamp) AS first_seen
+FROM page AS p
+JOIN indexable_crawl AS ic
+ON ic.page_id == p.id
+JOIN crawl AS c
+ON c.page_id == p.id
+LEFT JOIN threadpage AS tp
+ON tp.page_id == p.id
+WHERE tp.page_id IS NULL
+AND c.status == 20
+GROUP BY p.normalized_url
+""")
+ for page in pages_query.iterator():
+ resource = GeminiResource(page.fetchable_url)
+ if resource.is_valid and resource.is_log_post_like:
+ find_thread_tops(resource, page.first_seen, page.id)
+ print("\nDone!")
+
+
+if __name__ == "__main__":
+ main()
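[Annotation: a toy sketch of the addressing scheme the recursion above produces, using a hypothetical in-memory reply graph in place of the real link table.]

```
# Each key's list holds the pages that reply to it (hypothetical data).
graph = {
    "a": ["b", "c"],  # b and c reply to a
    "b": ["d"],       # d replies to b
    "c": [],
    "d": [],
}

def recurse(node, path, seen=()):
    members = [(path, node)]
    for i, child in enumerate(graph[node], start=1):
        if child in seen:
            continue  # circular links are skipped, as in recurse_thread
        members.extend(recurse(child, f"{path}.{i}", seen + (node,)))
    return members

print(recurse("a", "1"))
# [('1', 'a'), ('1.1', 'b'), ('1.1.1', 'd'), ('1.2', 'c')]
```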
diff --git a/serve/models.py b/serve/models.py
@@ -7,7 +7,7 @@ from whoosh import highlight, qparser
from whoosh.index import open_dir
from . import constants
-from gus.lib.db_model import init_db, Crawl, Link, Page, Search
+from gus.lib.db_model import init_db, Crawl, Link, Page, Search, Thread
from gus.lib.gemini import GeminiResource
from gus.lib.index_statistics import compute_index_statistics, load_all_statistics_from_file
from gus.lib.misc import bytes2human
@@ -69,7 +69,7 @@ class GUS():
return [], []
u = resource.indexable_url.rstrip("/")
- backlinks = Page.raw("""SELECT p_from.url, l.is_cross_host_like
+ backlinks_query = Page.raw("""SELECT p_from.url, l.is_cross_host_like
FROM page AS p_from
JOIN indexable_crawl AS ic
ON ic.page_id == p_from.id
@@ -78,14 +78,62 @@ ON l.from_page_id == p_from.id
JOIN page as p_to
ON p_to.id == l.to_page_id
WHERE p_to.url IN (?, ?)
+AND p_from.normalized_url != ?
GROUP BY p_from.normalized_url
-ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/")
+ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalized_url)
+ backlinks = backlinks_query.execute()
- internal_backlink_urls = [b.url for b in backlinks.execute() if not b.is_cross_host_like]
- external_backlink_urls = [b.url for b in backlinks.execute() if b.is_cross_host_like]
+ internal_backlink_urls = [b.url for b in backlinks if not b.is_cross_host_like]
+ external_backlink_urls = [b.url for b in backlinks if b.is_cross_host_like]
return internal_backlink_urls, external_backlink_urls
+ def get_threads(self):
+ threads_query = Thread.raw("""SELECT t.*
+ , tp.address
+ , p.fetchable_url
+ , p.url
+ , MIN(c.timestamp) AS first_seen
+FROM (
+ SELECT *
+ FROM thread
+ ORDER BY updated_at DESC
+ LIMIT 20) AS t
+JOIN threadpage AS tp
+ON tp.thread_id == t.id
+JOIN page AS p
+ON p.id == tp.page_id
+JOIN crawl AS c
+ON c.page_id == p.id
+WHERE c.status == 20
+GROUP BY tp.id
+ORDER BY t.updated_at DESC, t.id ASC, tp.address ASC""")
+ threads = []
+ last_date = None
+ last_id = None
+ for thread_member in threads_query.iterator():
+ if thread_member.updated_at.date() != last_date:
+ threads.append({
+ "threads": [],
+ "date": thread_member.updated_at,
+ })
+ last_date = thread_member.updated_at.date()
+ if thread_member.id != last_id:
+ threads[-1]["threads"].append({
+ "members": [],
+ "updated_at": thread_member.updated_at,
+ })
+ last_id = thread_member.id
+ threads[-1]["threads"][-1]["members"].append({
+ "url": thread_member.url,
+ "fetchable_url": thread_member.fetchable_url,
+ "address": thread_member.address,
+ "first_seen": datetime.strptime(thread_member.first_seen, "%Y-%m-%d %H:%M:%S.%f"),
+ })
+ return threads
+
+
def _get_link_text(result):
if result["content_type"] == "input":
prompt_suffix = ": {}".format(result["prompt"])
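[Annotation: for reference, the grouping loop in get_threads yields one bucket per calendar day, each holding whole threads. A rough sketch of the return value, with field values elided:]

```
# Approximate shape of get_threads()'s return value, newest day first:
# [
#   {
#     "date": <datetime of the bucket's day>,
#     "threads": [
#       {
#         "updated_at": <datetime>,
#         "members": [
#           {"url": ..., "fetchable_url": ..., "address": "1", "first_seen": <datetime>},
#           {"url": ..., "fetchable_url": ..., "address": "1.1", "first_seen": <datetime>},
#         ],
#       },
#     ],
#   },
# ]
```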
diff --git a/serve/templates/about.gmi b/serve/templates/about.gmi
@@ -65,3 +65,7 @@ The URL structure for retrieving a certain URL's backlinks page is predictable,
Note the distinction between "internal" and "cross-capsule" backlinks. Internal backlinks are backlinks from within your own capsule to the given page. Cross-capsule backlinks are backlinks from other users' capsules.
{% include 'fragments/footer.gmi' %}
+
+### Threads (coming soon!)
+
+Oftentimes in Geminispace a post on someone's gemlog will generate a reply on someone else's gemlog. Sometimes many replies! Sometimes the replies generate their own replies! GUS Threads allow you to visualize and explore these threads within Geminispace.
diff --git a/serve/templates/threads.gmi b/serve/templates/threads.gmi
@@ -0,0 +1,17 @@
+{% include 'fragments/header.gmi' %}
+
+
+## Threads
+
+{% for date in threads %}
+### {{ date["date"] | datetimeformat("%Y, %b %d") }}
+
+{% for thread in date["threads"] %}
+{% for member in thread["members"] %}
+=> {{ member["fetchable_url"] }} [{{ member["first_seen"] | datetimeformat("%b %d") }}] {{ member["address"] | threadaddressformat }} {{ member["url"][9:] }}
+{% endfor %}
+~~~~~~~~~~
+{% endfor %}
+
+{% endfor %}
+{% include 'fragments/footer.gmi' %}
diff --git a/serve/views.py b/serve/views.py
@@ -20,7 +20,15 @@ template_env = jinja2.Environment(
def datetimeformat(value, format="%Y-%m-%d"):
return value.strftime(format)
+
+def threadaddressformat(value):
+ depth = len(value.split("."))
+ if depth > 1:
+ return " " * (depth - 1) + "↳"
+ return ""
+
template_env.filters['datetimeformat'] = datetimeformat
+template_env.filters['threadaddressformat'] = threadaddressformat
def render_template(name: str, *args, **kwargs) -> str:
"""
@@ -132,3 +140,12 @@ def backlinks(request):
return Response(Status.SUCCESS, "text/gemini", body)
else:
return Response(Status.INPUT, "Gemini URL")
+
+
+@app.route("/threads")
+def threads(request):
+ threads = gus.get_threads()
+ body = render_template("threads.gmi",
+ threads=threads,
+ index_modification_time=gus.statistics["index_modification_time"])
+ return Response(Status.SUCCESS, "text/gemini", body)
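[Annotation: a quick illustration of the new filter's assumed behaviour, where depth is the number of dotted address components; the import path assumes serve.views loads cleanly in isolation.]

```
from serve.views import threadaddressformat

print(repr(threadaddressformat("1")))      # '' - thread tops get no arrow
print(repr(threadaddressformat("1.2")))    # one level of indent, then '↳'
print(repr(threadaddressformat("1.2.1")))  # two levels of indent, then '↳'
```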