commit f75751e5b939a8b828206cfe9e3eaf0c3f73edcb
parent 8c1399ade960df51af3c53f6bcd8ee7fad0df56e
Author: Natalie Pendragon <natpen@natpen.net>
Date: Tue, 28 Jul 2020 07:02:50 -0400
Add friendly authors and titles for threads
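
Thread pages now carry a human-readable author and title, shown on the
threads page instead of the raw URL. Both fields are resolved with a
cascade of heuristics: try the URL first, then the page content, then
fall back to the normalized host (for authors) or the URL path (for
titles).

A rough sketch of the intended behavior; the URL and content below are
made up for illustration:

    from gus.lib.gemini import GeminiResource

    resource = GeminiResource("gemini://example.org/users/alice/2020-07-27-hello-gemini.gmi")
    content = "# Hello Gemini\n..."
    resource.get_friendly_author(content)  # "alice", from the /users/ URL
    resource.get_friendly_title(content)   # "Hello Gemini", from the first heading
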
Diffstat:
5 files changed, 100 insertions(+), 11 deletions(-)
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -94,3 +94,5 @@ class ThreadPage(Model):
    thread = ForeignKeyField(Thread, backref="pages", on_delete="CASCADE")
    page = ForeignKeyField(Page, backref="threads", on_delete='CASCADE')
    address = TextField()
+    friendly_author = TextField()
+    friendly_title = TextField()
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -13,11 +13,19 @@ uses_netloc.append("gemini")
LOG_ROOT_LIKE_PATTERN = re.compile(".*/(gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/?$", flags=re.IGNORECASE)
LOG_POST_LIKE_PATTERN = re.compile(".*/((gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/.+$|.*/\d{4}[-_]\d{2}[-_]\d{2}.*$|.*/(19|20)\d{6}.*$)", flags=re.IGNORECASE)
-LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(".*/(games|archive|archives|rss)/.*", flags=re.IGNORECASE)
+LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(".*/(games|archive|archives|rss|handlers)/.*|atom.xml$|gemlog.gmi$", flags=re.IGNORECASE)
LOG_POST_GEMLOGBLUE_LIKE_PATTERN = re.compile("^/users/[a-z][-a-z0-9]*/\d+\.gmi?", flags=re.IGNORECASE)
+LOG_POST_BOSTON_LIKE_PATTERN = re.compile("^/boston/\d{4}/\d{2}/\d+\.\d+", flags=re.IGNORECASE)
+
ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE)
ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE)
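+# AUTHOR_URL_PATTERN: group 1 captures the author from /~user paths,
+# group 2 from /users/ paths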
+AUTHOR_URL_PATTERN = re.compile("^/~([a-z][-a-z0-9]*)|^/users/~?([a-z][-a-z0-9]*)", flags=re.IGNORECASE)
+AUTHOR_CONTENT_PATTERN = re.compile(".*(by|author): ([\w\s\d]+)", flags=re.IGNORECASE | re.MULTILINE)
+
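+# TITLE_URL_PATTERN: group 2 captures the slug; group 1 an optional leading
+# date, group 3 the file extension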
+TITLE_CONTENT_PATTERN = re.compile("^#\s(.*)$", flags=re.IGNORECASE | re.MULTILINE)
+TITLE_URL_PATTERN = re.compile(".*/(\d{8}[-_]|\d{4}[-_]\d{2}[-_]\d{2}[-_])?([a-z0-9-_]+)(\.[a-z0-9]+)$", flags=re.IGNORECASE)
+
class GeminiRobotFileParser(RobotFileParser):
    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
@@ -185,12 +193,63 @@ class GeminiResource():
        post_like_match = LOG_POST_LIKE_PATTERN.match(self.urlsplit.path)
        post_like_exclusion_match = LOG_POST_LIKE_EXCLUSION_PATTERN.match(self.urlsplit.path)
        post_gemlogblue_match = LOG_POST_GEMLOGBLUE_LIKE_PATTERN.match(self.urlsplit.path)
+        post_boston_match = LOG_POST_BOSTON_LIKE_PATTERN.match(self.urlsplit.path)
-        if (post_like_match and not post_like_exclusion_match) or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match):
+        if (post_like_match and not post_like_exclusion_match) or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match) or (self.normalized_host == "gemini.conman.org" and post_boston_match):
            is_log_post_like = True
        self._is_log_post_like = is_log_post_like
        return self._is_log_post_like
+    def get_friendly_author(self, content):
+        if not self.is_valid:
+            return None
+        friendly_author = None
+        # first check the URL
+        author_url_match = AUTHOR_URL_PATTERN.match(self.urlsplit.path)
+        if author_url_match:
+            if author_url_match[1]:
+                friendly_author = author_url_match[1]
+            elif author_url_match[2]:
+                friendly_author = author_url_match[2]
+        if friendly_author is None:
+            # if no URL match, try looking in page content
+            if isinstance(content, str):
+                # search (not match): a byline may appear on any line
+                author_content_match = AUTHOR_CONTENT_PATTERN.search(content)
+                if author_content_match:
+                    # group 2 is the name; group 1 the "by"/"author" keyword
+                    friendly_author = author_content_match[2]
+        if friendly_author is None:
+            # if still no match, use normalized host
+            friendly_author = self.normalized_host
+        return friendly_author
+
+
+    def get_friendly_title(self, content):
+        if not self.is_valid:
+            return None
+        friendly_title = None
+
+        if isinstance(content, str):
+            # first try page content; search (not match): the first
+            # heading need not be on the first line
+            title_content_match = TITLE_CONTENT_PATTERN.search(content)
+            if title_content_match:
+                friendly_title = title_content_match[1]
+        if friendly_title is None:
+            # if no content match, try looking in URL
+            title_url_match = TITLE_URL_PATTERN.match(self.urlsplit.path)
+            if title_url_match:
+                friendly_title = title_url_match[2].replace("-", " ").replace("_", " ").strip().title()
+        if friendly_title is None:
+            # if still no match, use URL path
+            friendly_title = self.urlsplit.path.lstrip("/")
+        return friendly_title
+
+
+    def get_log_post_date(self, content):
+        # stub: post-date extraction is not yet implemented; always returns None
+        if not self.is_log_post_like:
+            return None
+
+
+
    # constructed from fetchable_url
    # does not matter if quoted or unquoted so I choose arbitrarily to
    # standardize on unquoting it.
diff --git a/scripts/build_threads.py b/scripts/build_threads.py
@@ -5,7 +5,7 @@ from gus.lib.db_model import init_db, Link, Page, Thread, ThreadPage
from gus.lib.gemini import GeminiResource
-def find_thread_tops(resource, first_seen, page_id, current_chain=[]):
+def find_thread_tops(resource, first_seen, page_id, content, current_chain=[]):
"""
This function will recursively walk up to the tops of all threads a given
page belongs to, then call recurse_thread on each of them to actually build
@@ -36,7 +36,12 @@ ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_
            continue
        if is_threadable_link(resource, parent_resource, parent_page.is_cross_host_like):
            found_threadable_parents = True
-            find_thread_tops(parent_resource, parent_page.first_seen, parent_page.id, current_chain + [resource])
+            find_thread_tops(
+                parent_resource,
+                parent_page.first_seen,
+                parent_page.id,
+                parent_page.content,
+                current_chain + [resource])
    if not found_threadable_parents:
        # return early if thread top already processed
        try:
@@ -46,7 +51,7 @@ ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_
            return
        except ThreadPage.DoesNotExist:
            pass
-        full_thread = recurse_thread(resource, "1", first_seen, page_id)
+        full_thread = recurse_thread(resource, "1", first_seen, page_id, content)
        # Deduplicate
        full_thread.reverse()
@@ -62,11 +67,17 @@ ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_
        thread = Thread.create(updated_at=thread_updated_at)
        print()
        for m in full_thread:
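+            # m is (resource, thread address, first_seen, page_id, content)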
-            ThreadPage.create(thread=thread, page_id=m[3], address=m[1])
+            ThreadPage.create(
+                thread=thread,
+                page_id=m[3],
+                address=m[1],
+                friendly_author=m[0].get_friendly_author(m[4]),
+                friendly_title=m[0].get_friendly_title(m[4]),
+            )
            print(" -> [{:<11}] [{}] {}".format(m[1], m[2], m[0].fetchable_url))
-def recurse_thread(resource, path, first_seen, page_id, current_chain=[]):
+def recurse_thread(resource, path, first_seen, page_id, content, current_chain=[]):
    if not resource.is_valid or not resource.is_log_post_like:
        # if not resource.is_valid:
        return []
@@ -87,7 +98,13 @@ AND c.status == 20
GROUP BY p_from.normalized_url
ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalized_url)
    threadable_child_index = 1
-    new_thread_members = [(resource, path, first_seen, page_id)]
+    new_thread_members = [(
+        resource,
+        path,
+        first_seen,
+        page_id,
+        content,
+    )]
    for child in children_query.iterator():
        child_resource = GeminiResource(child.fetchable_url)
        if is_threadable_link(child_resource, resource, child.is_cross_host_like):
@@ -96,7 +113,14 @@ ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalize
            if any(r for r in current_chain if r.normalized_url == resource.normalized_url):
                continue
            child_path = f"{path}.{threadable_child_index}"
-            new_thread_members.extend(recurse_thread(child_resource, child_path, child.first_seen, child.id, current_chain + [resource]))
+            new_thread_members.extend(recurse_thread(
+                child_resource,
+                child_path,
+                child.first_seen,
+                child.id,
+                child.content,
+                current_chain + [resource]
+            ))
            threadable_child_index += 1
    return new_thread_members
@@ -124,7 +148,7 @@ GROUP BY p.normalized_url
    for page in pages_query.iterator():
        resource = GeminiResource(page.fetchable_url)
        if resource.is_valid and resource.is_log_post_like:
-            find_thread_tops(resource, page.first_seen, page.id)
+            find_thread_tops(resource, page.first_seen, page.id, page.content)
    print("\nDone!")
diff --git a/serve/models.py b/serve/models.py
@@ -91,6 +91,8 @@ ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalize
    def get_threads(self):
        threads_query = Thread.raw("""SELECT t.*
, tp.address
+ , tp.friendly_author
+ , tp.friendly_title
, p.fetchable_url
, p.url
, MIN(c.timestamp) AS first_seen
@@ -128,6 +130,8 @@ ORDER BY t.updated_at DESC, t.id ASC, tp.address ASC""")
"url": thread_member.url,
"fetchable_url": thread_member.fetchable_url,
"address": thread_member.address,
+ "friendly_author": thread_member.friendly_author,
+ "friendly_title": thread_member.friendly_title,
"first_seen": datetime.strptime(thread_member.first_seen, "%Y-%m-%d %H:%M:%S.%f"),
})
        # return sorted(threads, key=lambda x: (x["updated_at"], ), reverse=True)
diff --git a/serve/templates/threads.gmi b/serve/templates/threads.gmi
@@ -8,7 +8,7 @@
{% for thread in date["threads"] %}
{% for member in thread["members"] %}
-=> {{ member["fetchable_url"] }} [{{ member["first_seen"] | datetimeformat("%b %d") }}] {{member["address"] | threadaddressformat }} {{ member["url"][9:] }}
+=> {{ member["fetchable_url"] }} [{{ member["first_seen"] | datetimeformat("%b %d") }}] {{member["address"] | threadaddressformat }} {{ member["friendly_author"] }} - {{ member["friendly_title"] }}
{% endfor %}
~~~~~~~~~~
{% endfor %}