geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit f75751e5b939a8b828206cfe9e3eaf0c3f73edcb
parent 8c1399ade960df51af3c53f6bcd8ee7fad0df56e
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Tue, 28 Jul 2020 07:02:50 -0400

Add friendly authors and titles for threads

Diffstat:
M gus/lib/db_model.py | 2 ++
M gus/lib/gemini.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M scripts/build_threads.py | 40 ++++++++++++++++++++++++++++++++--------
M serve/models.py | 4 ++++
M serve/templates/threads.gmi | 2 +-
5 files changed, 100 insertions(+), 11 deletions(-)

diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -94,3 +94,5 @@ class ThreadPage(Model):
     thread = ForeignKeyField(Thread, backref="pages", on_delete="CASCADE")
     page = ForeignKeyField(Page, backref="threads", on_delete='CASCADE')
     address = TextField()
+    friendly_author = TextField()
+    friendly_title = TextField()
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -13,11 +13,19 @@ uses_netloc.append("gemini")
 LOG_ROOT_LIKE_PATTERN = re.compile(".*/(gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/?$", flags=re.IGNORECASE)
 LOG_POST_LIKE_PATTERN = re.compile(".*/((gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/.+$|.*/\d{4}[-_]\d{2}[-_]\d{2}.*$|.*/(19|20)\d{6}.*$)", flags=re.IGNORECASE)
-LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(".*/(games|archive|archives|rss)/.*", flags=re.IGNORECASE)
+LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(".*/(games|archive|archives|rss|handlers)/.*|atom.xml$|gemlog.gmi$", flags=re.IGNORECASE)
 LOG_POST_GEMLOGBLUE_LIKE_PATTERN = re.compile("^/users/[a-z][-a-z0-9]*/\d+\.gmi?", flags=re.IGNORECASE)
+LOG_POST_BOSTON_LIKE_PATTERN = re.compile("^/boston/\d{4}/\d{2}/\d+\.\d+", flags=re.IGNORECASE)
+
 ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE)
 ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE)
+AUTHOR_URL_PATTERN = re.compile("^/~([a-z][-a-z0-9]/*)|^/users/~?([a-z][-a-z0-9]*)", flags=re.IGNORECASE)
+AUTHOR_CONTENT_PATTERN = re.compile(".*(by|author): ([\w\s\d]+)", flags=re.IGNORECASE | re.MULTILINE)
+
+TITLE_CONTENT_PATTERN = re.compile("^#\s(.*)$", flags=re.IGNORECASE | re.MULTILINE)
+TITLE_URL_PATTERN = re.compile(".*/(\d{8}[-_]|\d{4}[-_]\d{2}[-_]\d{2}[-_])?([a-z0-9-_]+)(\.[a-z0-9]+)$", flags=re.IGNORECASE)
+
 
 class GeminiRobotFileParser(RobotFileParser):
     def set_url(self, url):
         """Sets the URL referring to a robots.txt file."""
@@ -185,12 +193,63 @@ class GeminiResource():
         post_like_match = LOG_POST_LIKE_PATTERN.match(self.urlsplit.path)
         post_like_exclusion_match = LOG_POST_LIKE_EXCLUSION_PATTERN.match(self.urlsplit.path)
         post_gemlogblue_match = LOG_POST_GEMLOGBLUE_LIKE_PATTERN.match(self.urlsplit.path)
-        if (post_like_match and not post_like_exclusion_match) or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match):
+        post_boston_match = LOG_POST_BOSTON_LIKE_PATTERN.match(self.urlsplit.path)
+        if (post_like_match and not post_like_exclusion_match) or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match) or (self.normalized_host == "gemini.conman.org" and post_boston_match):
             is_log_post_like = True
         self._is_log_post_like = is_log_post_like
         return self._is_log_post_like
 
+    def get_friendly_author(self, content):
+        if not self.is_valid:
+            return None
+        friendly_author = None
+        author_url_match = AUTHOR_URL_PATTERN.match(self.urlsplit.path)
+        if author_url_match:
+            # first check url
+            if author_url_match[1]:
+                friendly_author = author_url_match[1]
+            elif author_url_match[2]:
+                friendly_author = author_url_match[2]
+        if friendly_author is None:
+            # if no URL match, try looking in page content
+            if isinstance(content, str):
+                author_content_match = AUTHOR_CONTENT_PATTERN.match(content)
+                if author_content_match:
+                    friendly_author = author_content_match[1]
+        if friendly_author is None:
+            # if still no match, use normalized host
+            friendly_author = self.normalized_host
+        return friendly_author
+
+
+    def get_friendly_title(self, content):
+        if not self.is_valid:
+            return None
+        friendly_title = None
+
+        if isinstance(content, str):
+            title_content_match = TITLE_CONTENT_PATTERN.match(content)
+            if title_content_match:
+                # first try page content
+                friendly_title = title_content_match[1]
+        if friendly_title is None:
+            # if no content match, try looking in URL
+            title_url_match = TITLE_URL_PATTERN.match(self.urlsplit.path)
+            if title_url_match:
+                friendly_title = title_url_match[2].replace("-", " ").replace("_", " ").strip().title()
+        if friendly_title is None:
+            # if still no match, use URL path
+            friendly_title = self.urlsplit.path.lstrip("/")
+        return friendly_title
+
+
+    def get_log_post_date(self, content):
+        if not self.is_log_post_like:
+            return None
+
+
     # constructed from fetchable_url
     # does not matter if quoted or unquoted so I choose arbitrarily to
     # standardize on unquoting it.
diff --git a/scripts/build_threads.py b/scripts/build_threads.py
@@ -5,7 +5,7 @@ from gus.lib.db_model import init_db, Link, Page, Thread, ThreadPage
 from gus.lib.gemini import GeminiResource
 
 
-def find_thread_tops(resource, first_seen, page_id, current_chain=[]):
+def find_thread_tops(resource, first_seen, page_id, content, current_chain=[]):
     """
     This function will recursively walk up to the tops of all threads a given
     page belongs to, then call recurse_thread on each of them to actually build
@@ -36,7 +36,12 @@ ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_
             continue
         if is_threadable_link(resource, parent_resource, parent_page.is_cross_host_like):
             found_threadable_parents = True
-            find_thread_tops(parent_resource, parent_page.first_seen, parent_page.id, current_chain + [resource])
+            find_thread_tops(
+                parent_resource,
+                parent_page.first_seen,
+                parent_page.id,
+                parent_page.content,
+                current_chain + [resource])
     if not found_threadable_parents:
         # return early if thread top already processed
         try:
@@ -46,7 +51,7 @@ ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_
             return
         except ThreadPage.DoesNotExist:
             pass
-    full_thread = recurse_thread(resource, "1", first_seen, page_id)
+    full_thread = recurse_thread(resource, "1", first_seen, page_id, content)
 
     # Deduplicate
     full_thread.reverse()
@@ -62,11 +67,17 @@ ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_
     thread = Thread.create(updated_at=thread_updated_at)
     print()
     for m in full_thread:
-        ThreadPage.create(thread=thread, page_id=m[3], address=m[1])
+        ThreadPage.create(
+            thread=thread,
+            page_id=m[3],
+            address=m[1],
+            friendly_author=m[0].get_friendly_author(m[4]),
+            friendly_title=m[0].get_friendly_title(m[4]),
+        )
         print(" -> [{:<11}] [{}] {}".format(m[1], m[2], m[0].fetchable_url))
 
 
-def recurse_thread(resource, path, first_seen, page_id, current_chain=[]):
+def recurse_thread(resource, path, first_seen, page_id, content, current_chain=[]):
     if not resource.is_valid or not resource.is_log_post_like:
     # if not resource.is_valid:
         return []
@@ -87,7 +98,13 @@ AND c.status == 20
 GROUP BY p_from.normalized_url
 ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalized_url)
     threadable_child_index = 1
-    new_thread_members = [(resource, path, first_seen, page_id)]
+    new_thread_members = [(
+        resource,
+        path,
+        first_seen,
+        page_id,
+        content,
+    )]
     for child in children_query.iterator():
         child_resource = GeminiResource(child.fetchable_url)
         if is_threadable_link(child_resource, resource, child.is_cross_host_like):
@@ -96,7 +113,14 @@ ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalize
             if any(r for r in current_chain if r.normalized_url == resource.normalized_url):
                 continue
             child_path = f"{path}.{threadable_child_index}"
-            new_thread_members.extend(recurse_thread(child_resource, child_path, child.first_seen, child.id, current_chain + [resource]))
+            new_thread_members.extend(recurse_thread(
+                child_resource,
+                child_path,
+                child.first_seen,
+                child.id,
+                child.content,
+                current_chain + [resource]
+            ))
             threadable_child_index += 1
     return new_thread_members
@@ -124,7 +148,7 @@ GROUP BY p.normalized_url
     for page in pages_query.iterator():
         resource = GeminiResource(page.fetchable_url)
         if resource.is_valid and resource.is_log_post_like:
-            find_thread_tops(resource, page.first_seen, page.id)
+            find_thread_tops(resource, page.first_seen, page.id, page.content)
 
     print("\nDone!")
diff --git a/serve/models.py b/serve/models.py
@@ -91,6 +91,8 @@ ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalize
     def get_threads(self):
         threads_query = Thread.raw("""SELECT t.*
 , tp.address
+, tp.friendly_author
+, tp.friendly_title
 , p.fetchable_url
 , p.url
 , MIN(c.timestamp) AS first_seen
@@ -128,6 +130,8 @@ ORDER BY t.updated_at DESC, t.id ASC, tp.address ASC""")
                     "url": thread_member.url,
                     "fetchable_url": thread_member.fetchable_url,
                     "address": thread_member.address,
+                    "friendly_author": thread_member.friendly_author,
+                    "friendly_title": thread_member.friendly_title,
                     "first_seen": datetime.strptime(thread_member.first_seen, "%Y-%m-%d %H:%M:%S.%f"),
                 })
         # return sorted(threads, key=lambda x: (x["updated_at"], ), reverse=True)
diff --git a/serve/templates/threads.gmi b/serve/templates/threads.gmi
@@ -8,7 +8,7 @@
 
 {% for thread in date["threads"] %}
 {% for member in thread["members"] %}
-=> {{ member["fetchable_url"] }} [{{ member["first_seen"] | datetimeformat("%b %d") }}] {{member["address"] | threadaddressformat }} {{ member["url"][9:] }}
+=> {{ member["fetchable_url"] }} [{{ member["first_seen"] | datetimeformat("%b %d") }}] {{member["address"] | threadaddressformat }} {{ member["friendly_author"] }} - {{ member["friendly_title"] }}
 {% endfor %}
 ~~~~~~~~~~
 {% endfor %}
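
For illustration only (not part of the commit): a minimal, self-contained sketch of the title fallback chain that get_friendly_title() implements above, using the TITLE_* patterns introduced in gus/lib/gemini.py. Page content is tried first, then a URL slug, then the raw path. The sample paths and content below are made up.

```
import re

TITLE_CONTENT_PATTERN = re.compile(r"^#\s(.*)$", flags=re.IGNORECASE | re.MULTILINE)
TITLE_URL_PATTERN = re.compile(
    r".*/(\d{8}[-_]|\d{4}[-_]\d{2}[-_]\d{2}[-_])?([a-z0-9-_]+)(\.[a-z0-9]+)$",
    flags=re.IGNORECASE,
)

def friendly_title(path, content):
    # 1. a gemtext "# ..." heading at the very start of the page content
    #    (the commit uses .match(), so only a leading heading is picked up)
    if isinstance(content, str):
        m = TITLE_CONTENT_PATTERN.match(content)
        if m:
            return m[1]
    # 2. the URL slug, with any leading date prefix stripped and the
    #    remainder title-cased
    m = TITLE_URL_PATTERN.match(path)
    if m:
        return m[2].replace("-", " ").replace("_", " ").strip().title()
    # 3. last resort: the bare URL path
    return path.lstrip("/")

print(friendly_title("/gemlog/2020-07-28-my-first-post.gmi", None))
# -> My First Post
print(friendly_title("/gemlog/post.gmi", "# Hello Geminispace\n..."))
# -> Hello Geminispace
```

get_friendly_author() follows the same shape with its own chain: tilde/users path via AUTHOR_URL_PATTERN, then a "by:"/"author:" line via AUTHOR_CONTENT_PATTERN, then the normalized host as the final fallback.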
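Likewise for the dotted thread addresses that recurse_thread() builds ("1", "1.1", "1.2.1", ...): each threadable child gets its parent's address plus a 1-based index via child_path = f"{path}.{threadable_child_index}". A standalone sketch with a made-up reply tree, not the commit's database-backed walk:

```
def addresses(node, path="1"):
    # yield (address, url) pairs depth-first, numbering children from 1
    yield path, node["url"]
    for i, child in enumerate(node.get("replies", []), start=1):
        yield from addresses(child, f"{path}.{i}")

thread = {
    "url": "gemini://example.org/gemlog/original-post.gmi",
    "replies": [
        {"url": "gemini://example.org/gemlog/re-original-post.gmi"},
        {"url": "gemini://example.net/gemlog/a-reply.gmi",
         "replies": [{"url": "gemini://example.org/gemlog/re-a-reply.gmi"}]},
    ],
}

for addr, url in addresses(thread):
    print(addr, url)
# 1, 1.1, 1.2, 1.2.1 -- the addresses threads.gmi renders via threadaddressformat
```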