commit 8c1399ade960df51af3c53f6bcd8ee7fad0df56e
parent ded0c0ca62fe84b119a4325a53331408328e389d
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 27 Jul 2020 14:50:15 -0400
Threads v1
Diffstat:
8 files changed, 271 insertions(+), 19 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -228,7 +228,7 @@ def index_prompt(resource, response):
def index_content(resource, response):
print("INDEXING CONTENT...")
- change_frequency = constants.DEFAULT_ROOT_CHANGE_FREQUENCY if resource.is_root_like or resource.is_log_like else constants.DEFAULT_NON_ROOT_CHANGE_FREQUENCY
+ change_frequency = constants.DEFAULT_ROOT_CHANGE_FREQUENCY if resource.is_root_like or resource.is_log_root_like else constants.DEFAULT_NON_ROOT_CHANGE_FREQUENCY
doc = {
"url": resource.indexable_url,
"fetchable_url": resource.fetchable_url,
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -10,11 +10,13 @@ from peewee import (
TextField,
)
+from gus.lib.gemini import GeminiResource
+
def init_db(filename=":memory:"):
"""
Bind an SQLite database to the Peewee ORM models.
"""
- models = [Crawl, Link, Page, Search]
+ models = [Crawl, Link, Page, Search, Thread, ThreadPage]
db = SqliteDatabase(filename)
db.bind(models)
db.create_tables(models)
@@ -77,3 +79,18 @@ class Search(Model):
query = TextField()
timestamp = DateTimeField()
+
+class Thread(Model):
+ """
+ Thread definitions.
+ """
+ updated_at = DateTimeField()
+
+class ThreadPage(Model):
+ """
+ Mapping table of threads to their member pages.
+ """
+
+ thread = ForeignKeyField(Thread, backref="pages", on_delete="CASCADE")
+    page = ForeignKeyField(Page, backref="threads", on_delete="CASCADE")
+ address = TextField()
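[Annotation: the address column encodes each page's position within its thread as a dotted path, assigned by scripts/build_threads.py below. A minimal sketch with hypothetical URLs; note that plain string sort gives depth-first thread order, at least while no page has ten or more direct replies.]

```
# (address, url) pairs for one hypothetical thread: "1" is the thread top,
# "1.1" its first reply, "1.1.1" a reply to that reply, and so on.
members = [
    ("1",     "gemini://a.example/gemlog/post.gmi"),
    ("1.1",   "gemini://b.example/glog/re-post.gmi"),
    ("1.1.1", "gemini://a.example/gemlog/re-re-post.gmi"),
    ("1.2",   "gemini://c.example/phlog/another-reply.gmi"),
]
# Lexicographic order on the address matches depth-first reply order.
assert sorted(members) == members
```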
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -11,9 +11,12 @@ from gus.lib.domain import is_domain
uses_relative.append("gemini")
uses_netloc.append("gemini")
-LOG_LIKE_PATTERN = re.compile(".*/(gemlog|glog|starlog|pikkulog)/?$")
-ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$")
-ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?")
+LOG_ROOT_LIKE_PATTERN = re.compile(".*/(gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/?$", flags=re.IGNORECASE)
+LOG_POST_LIKE_PATTERN = re.compile(r".*/((gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/.+$|.*/\d{4}[-_]\d{2}[-_]\d{2}.*$|.*/(19|20)\d{6}.*$)", flags=re.IGNORECASE)
+LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(".*/(games|archive|archives|rss)/.*", flags=re.IGNORECASE)
+LOG_POST_GEMLOGBLUE_LIKE_PATTERN = re.compile(r"^/users/[a-z][-a-z0-9]*/\d+\.gmi?", flags=re.IGNORECASE)
+ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE)
+ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE)
class GeminiRobotFileParser(RobotFileParser):
def set_url(self, url):
@@ -52,7 +55,8 @@ class GeminiResource():
self._fetchable_url = None
self._indexable_url = None
self._is_root_like = None
- self._is_log_like = None
+ self._is_log_root_like = None
+ self._is_log_post_like = None
self.contained_resources = None
def urlsplit_featureful(url, fully_qualified_parent_url=None, parent_hostname=None):
@@ -166,13 +170,25 @@ class GeminiResource():
return self._is_root_like
- def _get_is_log_like(self):
- if self._is_log_like is None:
- is_log_like = False
- if self.urlsplit.path == "" or self.urlsplit.path == "/" or LOG_LIKE_PATTERN.match(self.urlsplit.path):
- is_log_like = True
- self._is_log_like = is_log_like
- return self._is_log_like
+ def _get_is_log_root_like(self):
+ if self._is_log_root_like is None:
+ is_log_root_like = False
+ if self.urlsplit.path == "" or self.urlsplit.path == "/" or LOG_ROOT_LIKE_PATTERN.match(self.urlsplit.path):
+ is_log_root_like = True
+ self._is_log_root_like = is_log_root_like
+ return self._is_log_root_like
+
+
+ def _get_is_log_post_like(self):
+ if self._is_log_post_like is None:
+ is_log_post_like = False
+ post_like_match = LOG_POST_LIKE_PATTERN.match(self.urlsplit.path)
+ post_like_exclusion_match = LOG_POST_LIKE_EXCLUSION_PATTERN.match(self.urlsplit.path)
+ post_gemlogblue_match = LOG_POST_GEMLOGBLUE_LIKE_PATTERN.match(self.urlsplit.path)
+ if (post_like_match and not post_like_exclusion_match) or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match):
+ is_log_post_like = True
+ self._is_log_post_like = is_log_post_like
+ return self._is_log_post_like
# constructed from fetchable_url
@@ -187,7 +203,8 @@ class GeminiResource():
# should be unquoted.
indexable_url = property(_get_indexable_url)
is_root_like = property(_get_is_root_like)
- is_log_like = property(_get_is_log_like)
+ is_log_root_like = property(_get_is_log_root_like)
+ is_log_post_like = property(_get_is_log_post_like)
normalized_host_like = property(_get_normalized_host_like)
def fetch(self):
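[Annotation: a quick sanity check of the new path classifiers above. The example paths are made up, not from the commit.]

```
from gus.lib.gemini import (
    LOG_ROOT_LIKE_PATTERN,
    LOG_POST_LIKE_PATTERN,
    LOG_POST_LIKE_EXCLUSION_PATTERN,
)

# A gemlog index page is "log root like"...
assert LOG_ROOT_LIKE_PATTERN.match("/~alice/gemlog/")
# ...while dated pages beneath it are "log post like".
assert LOG_POST_LIKE_PATTERN.match("/~alice/gemlog/2020-07-27-threads.gmi")
assert LOG_POST_LIKE_PATTERN.match("/posts/20200727.gmi")
assert not LOG_POST_LIKE_PATTERN.match("/~alice/gemlog/")
# Archive listings match the exclusion pattern and are filtered out.
assert LOG_POST_LIKE_EXCLUSION_PATTERN.match("/archive/2020-07-27.gmi")
```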
diff --git a/scripts/build_threads.py b/scripts/build_threads.py
@@ -0,0 +1,132 @@
+from peewee import JOIN
+
+from gus import constants
+from gus.lib.db_model import init_db, Link, Page, Thread, ThreadPage
+from gus.lib.gemini import GeminiResource
+
+
+def find_thread_tops(resource, first_seen, page_id, current_chain=[]):
+ """
+ This function will recursively walk up to the tops of all threads a given
+ page belongs to, then call recurse_thread on each of them to actually build
+ the full threads.
+ """
+ u = resource.indexable_url.rstrip("/")
+    parent_pages_query = Page.raw("""SELECT p_to.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen
+FROM page AS p_from
+JOIN link AS l
+ON l.from_page_id == p_from.id
+JOIN page AS p_to
+ON p_to.id == l.to_page_id
+JOIN indexable_crawl AS ic
+ON ic.page_id == p_to.id
+JOIN crawl AS c
+ON c.page_id == p_to.id
+WHERE p_from.url IN (?, ?)
+AND p_to.normalized_url != ?
+AND c.status == 20
+GROUP BY p_to.normalized_url
+ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_url)
+ found_threadable_parents = False
+ for parent_page in parent_pages_query.iterator():
+ parent_resource = GeminiResource(parent_page.fetchable_url)
+        # Skip any parent that is already in the chain of resources seen in
+        # this call stack - that indicates circular linking
+        if any(r.normalized_url == parent_resource.normalized_url for r in current_chain):
+ continue
+ if is_threadable_link(resource, parent_resource, parent_page.is_cross_host_like):
+ found_threadable_parents = True
+ find_thread_tops(parent_resource, parent_page.first_seen, parent_page.id, current_chain + [resource])
+ if not found_threadable_parents:
+ # return early if thread top already processed
+ try:
+ query = ThreadPage.select().join(Page).where(Page.url == resource.indexable_url, ThreadPage.address == "1")
+ query.get()
+ print(f"\nAlready done: {resource.fetchable_url}")
+ return
+ except ThreadPage.DoesNotExist:
+ pass
+ full_thread = recurse_thread(resource, "1", first_seen, page_id)
+
+    # Deduplicate, keeping only each page's first (shallowest) occurrence
+ full_thread.reverse()
+ i = 0
+ while i < len(full_thread):
+ if any(x for x in full_thread[i+1:] if x[0].normalized_url == full_thread[i][0].normalized_url):
+ full_thread.pop(i)
+ else:
+ i += 1
+ full_thread.reverse()
+
+ thread_updated_at = max(m[2] for m in full_thread)
+ thread = Thread.create(updated_at=thread_updated_at)
+ print()
+ for m in full_thread:
+ ThreadPage.create(thread=thread, page_id=m[3], address=m[1])
+ print(" -> [{:<11}] [{}] {}".format(m[1], m[2], m[0].fetchable_url))
+
+
+def recurse_thread(resource, path, first_seen, page_id, current_chain=[]):
+ if not resource.is_valid or not resource.is_log_post_like:
+ return []
+ u = resource.indexable_url.rstrip("/")
+ children_query = Page.raw("""SELECT p_from.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen
+FROM page AS p_from
+JOIN indexable_crawl AS ic
+ON ic.page_id == p_from.id
+JOIN crawl AS c
+ON c.page_id == p_from.id
+JOIN link AS l
+ON l.from_page_id == p_from.id
+JOIN page AS p_to
+ON p_to.id == l.to_page_id
+WHERE p_to.url IN (?, ?)
+AND p_from.normalized_url != ?
+AND c.status == 20
+GROUP BY p_from.normalized_url
+ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalized_url)
+ threadable_child_index = 1
+ new_thread_members = [(resource, path, first_seen, page_id)]
+ for child in children_query.iterator():
+ child_resource = GeminiResource(child.fetchable_url)
+ if is_threadable_link(child_resource, resource, child.is_cross_host_like):
+            # Skip any child that is already in the chain of resources seen in
+            # this call stack - that indicates circular linking
+            if any(r.normalized_url == child_resource.normalized_url for r in current_chain):
+ continue
+ child_path = f"{path}.{threadable_child_index}"
+ new_thread_members.extend(recurse_thread(child_resource, child_path, child.first_seen, child.id, current_chain + [resource]))
+ threadable_child_index += 1
+ return new_thread_members
+
+
+def is_threadable_link(r1, r2, is_cross_host_like):
+ return r1.is_log_post_like and r2.is_log_post_like and is_cross_host_like
+
+
+def main():
+ db = init_db(f"index/{constants.DB_FILENAME}")
+ Thread.delete().execute()
+ ThreadPage.delete().execute()
+ pages_query = Page.raw("""SELECT p.*, MIN(c.timestamp) AS first_seen
+FROM page AS p
+JOIN indexable_crawl AS ic
+ON ic.page_id == p.id
+JOIN crawl AS c
+ON c.page_id == p.id
+LEFT JOIN threadpage AS tp
+ON tp.page_id == p.id
+WHERE tp.page_id IS NULL
+AND c.status == 20
+GROUP BY p.normalized_url
+""")
+ for page in pages_query.iterator():
+ resource = GeminiResource(page.fetchable_url)
+ if resource.is_valid and resource.is_log_post_like:
+ find_thread_tops(resource, page.first_seen, page.id)
+ print("\nDone!")
+
+
+if __name__ == "__main__":
+ main()
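[Annotation: a toy sketch of the addressing scheme the recursion above produces, using a hypothetical in-memory reply graph in place of the real link table.]

```
# Each key's list holds the pages that reply to it (hypothetical data).
graph = {
    "a": ["b", "c"],  # b and c reply to a
    "b": ["d"],       # d replies to b
    "c": [],
    "d": [],
}

def recurse(node, path, seen=()):
    members = [(path, node)]
    for i, child in enumerate(graph[node], start=1):
        if child in seen:
            continue  # circular links are skipped, as in recurse_thread
        members.extend(recurse(child, f"{path}.{i}", seen + (node,)))
    return members

print(recurse("a", "1"))
# [('1', 'a'), ('1.1', 'b'), ('1.1.1', 'd'), ('1.2', 'c')]
```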
diff --git a/serve/models.py b/serve/models.py
@@ -7,7 +7,7 @@ from whoosh import highlight, qparser
from whoosh.index import open_dir
from . import constants
-from gus.lib.db_model import init_db, Crawl, Link, Page, Search
+from gus.lib.db_model import init_db, Crawl, Link, Page, Search, Thread
from gus.lib.gemini import GeminiResource
from gus.lib.index_statistics import compute_index_statistics, load_all_statistics_from_file
from gus.lib.misc import bytes2human
@@ -69,7 +69,7 @@ class GUS():
return [], []
u = resource.indexable_url.rstrip("/")
- backlinks = Page.raw("""SELECT p_from.url, l.is_cross_host_like
+ backlinks_query = Page.raw("""SELECT p_from.url, l.is_cross_host_like
FROM page AS p_from
JOIN indexable_crawl AS ic
ON ic.page_id == p_from.id
@@ -78,14 +78,62 @@ ON l.from_page_id == p_from.id
JOIN page as p_to
ON p_to.id == l.to_page_id
WHERE p_to.url IN (?, ?)
+AND p_from.normalized_url != ?
GROUP BY p_from.normalized_url
-ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/")
+ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalized_url)
+ backlinks = backlinks_query.execute()
- internal_backlink_urls = [b.url for b in backlinks.execute() if not b.is_cross_host_like]
- external_backlink_urls = [b.url for b in backlinks.execute() if b.is_cross_host_like]
+ internal_backlink_urls = [b.url for b in backlinks if not b.is_cross_host_like]
+ external_backlink_urls = [b.url for b in backlinks if b.is_cross_host_like]
return internal_backlink_urls, external_backlink_urls
+ def get_threads(self):
+ threads_query = Thread.raw("""SELECT t.*
+ , tp.address
+ , p.fetchable_url
+ , p.url
+ , MIN(c.timestamp) AS first_seen
+FROM (
+ SELECT *
+ FROM thread
+ ORDER BY updated_at DESC
+ LIMIT 20) AS t
+JOIN threadpage AS tp
+ON tp.thread_id == t.id
+JOIN page AS p
+ON p.id == tp.page_id
+JOIN crawl AS c
+ON c.page_id == p.id
+WHERE c.status == 20
+GROUP BY tp.id
+ORDER BY t.updated_at DESC, t.id ASC, tp.address ASC""")
+ threads = []
+ last_date = None
+ last_id = None
+ for thread_member in threads_query.iterator():
+ if thread_member.updated_at.date() != last_date:
+ threads.append({
+ "threads": [],
+ "date": thread_member.updated_at,
+ })
+ last_date = thread_member.updated_at.date()
+ if thread_member.id != last_id:
+ threads[-1]["threads"].append({
+ "members": [],
+ "updated_at": thread_member.updated_at,
+ })
+ last_id = thread_member.id
+ threads[-1]["threads"][-1]["members"].append({
+ "url": thread_member.url,
+ "fetchable_url": thread_member.fetchable_url,
+ "address": thread_member.address,
+ "first_seen": datetime.strptime(thread_member.first_seen, "%Y-%m-%d %H:%M:%S.%f"),
+ })
+ return threads
+
+
def _get_link_text(result):
if result["content_type"] == "input":
prompt_suffix = ": {}".format(result["prompt"])
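[Annotation: for reference, the grouping loop in get_threads yields one bucket per calendar day, each holding whole threads. A rough sketch of the return value, with field values elided:]

```
# Approximate shape of get_threads()'s return value, newest day first:
# [
#   {
#     "date": <datetime of the bucket's day>,
#     "threads": [
#       {
#         "updated_at": <datetime>,
#         "members": [
#           {"url": ..., "fetchable_url": ..., "address": "1", "first_seen": <datetime>},
#           {"url": ..., "fetchable_url": ..., "address": "1.1", "first_seen": <datetime>},
#         ],
#       },
#     ],
#   },
# ]
```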
diff --git a/serve/templates/about.gmi b/serve/templates/about.gmi
@@ -65,3 +65,7 @@ The URL structure for retrieving a certain URL's backlinks page is predictable,
Note the distinction between "internal" and "cross-capsule" backlinks. Internal backlinks are backlinks from within your own capsule to the given page. Cross-capsule backlinks are backlinks from other users' capsules.
{% include 'fragments/footer.gmi' %}
+
+### Threads (coming soon!)
+
+Oftentimes in Geminispace a post on someone's gemlog will generate a reply on someone else's gemlog. Sometimes many replies! Sometimes the replies generate their own replies! GUS Threads allow you to visualize and explore these threads within Geminispace.
diff --git a/serve/templates/threads.gmi b/serve/templates/threads.gmi
@@ -0,0 +1,17 @@
+{% include 'fragments/header.gmi' %}
+
+
+## Threads
+
+{% for date in threads %}
+### {{ date["date"] | datetimeformat("%Y, %b %d") }}
+
+{% for thread in date["threads"] %}
+{% for member in thread["members"] %}
+=> {{ member["fetchable_url"] }} [{{ member["first_seen"] | datetimeformat("%b %d") }}] {{ member["address"] | threadaddressformat }} {{ member["url"][9:] }}
+{% endfor %}
+~~~~~~~~~~
+{% endfor %}
+
+{% endfor %}
+{% include 'fragments/footer.gmi' %}
diff --git a/serve/views.py b/serve/views.py
@@ -20,7 +20,15 @@ template_env = jinja2.Environment(
def datetimeformat(value, format="%Y-%m-%d"):
return value.strftime(format)
+
+def threadaddressformat(value):
+ depth = len(value.split("."))
+ if depth > 1:
+ return " " * (depth - 1) + "↳"
+ return ""
+
template_env.filters['datetimeformat'] = datetimeformat
+template_env.filters['threadaddressformat'] = threadaddressformat
def render_template(name: str, *args, **kwargs) -> str:
"""
@@ -132,3 +140,12 @@ def backlinks(request):
return Response(Status.SUCCESS, "text/gemini", body)
else:
return Response(Status.INPUT, "Gemini URL")
+
+
+@app.route("/threads")
+def threads(request):
+ threads = gus.get_threads()
+ body = render_template("threads.gmi",
+ threads=threads,
+ index_modification_time=gus.statistics["index_modification_time"])
+ return Response(Status.SUCCESS, "text/gemini", body)
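[Annotation: a quick illustration of the new filter's assumed behaviour, where depth is the number of dotted address components; the import path assumes serve.views loads cleanly in isolation.]

```
from serve.views import threadaddressformat

print(repr(threadaddressformat("1")))      # '' - thread tops get no arrow
print(repr(threadaddressformat("1.2")))    # one level of indent, then '↳'
print(repr(threadaddressformat("1.2.1")))  # two levels of indent, then '↳'
```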