geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 8c1399ade960df51af3c53f6bcd8ee7fad0df56e
parent ded0c0ca62fe84b119a4325a53331408328e389d
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Mon, 27 Jul 2020 14:50:15 -0400

Threads v1

Diffstat:
M gus/crawl.py | 2 +-
M gus/lib/db_model.py | 19 ++++++++++++++++++-
M gus/lib/gemini.py | 41 +++++++++++++++++++++++++++++------------
A scripts/build_threads.py | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M serve/models.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M serve/templates/about.gmi | 4 ++++
A serve/templates/threads.gmi | 17 +++++++++++++++++
M serve/views.py | 17 +++++++++++++++++
8 files changed, 271 insertions(+), 19 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py @@ -228,7 +228,7 @@ def index_prompt(resource, response): def index_content(resource, response): print("INDEXING CONTENT...") - change_frequency = constants.DEFAULT_ROOT_CHANGE_FREQUENCY if resource.is_root_like or resource.is_log_like else constants.DEFAULT_NON_ROOT_CHANGE_FREQUENCY + change_frequency = constants.DEFAULT_ROOT_CHANGE_FREQUENCY if resource.is_root_like or resource.is_log_root_like else constants.DEFAULT_NON_ROOT_CHANGE_FREQUENCY doc = { "url": resource.indexable_url, "fetchable_url": resource.fetchable_url, diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py @@ -10,11 +10,13 @@ from peewee import ( TextField, ) +from gus.lib.gemini import GeminiResource + def init_db(filename=":memory:"): """ Bind an SQLite database to the Peewee ORM models. """ - models = [Crawl, Link, Page, Search] + models = [Crawl, Link, Page, Search, Thread, ThreadPage] db = SqliteDatabase(filename) db.bind(models) db.create_tables(models) @@ -77,3 +79,18 @@ class Search(Model): query = TextField() timestamp = DateTimeField() + +class Thread(Model): + """ + Thread definitions. + """ + updated_at = DateTimeField() + +class ThreadPage(Model): + """ + Mapping table of threads to their member pages. 
+ """ + + thread = ForeignKeyField(Thread, backref="pages", on_delete="CASCADE") + page = ForeignKeyField(Page, backref="threads", on_delete='CASCADE') + address = TextField() diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py @@ -11,9 +11,12 @@ from gus.lib.domain import is_domain uses_relative.append("gemini") uses_netloc.append("gemini") -LOG_LIKE_PATTERN = re.compile(".*/(gemlog|glog|starlog|pikkulog)/?$") -ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$") -ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?") +LOG_ROOT_LIKE_PATTERN = re.compile(".*/(gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/?$", flags=re.IGNORECASE) +LOG_POST_LIKE_PATTERN = re.compile(".*/((gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/.+$|.*/\d{4}[-_]\d{2}[-_]\d{2}.*$|.*/(19|20)\d{6}.*$)", flags=re.IGNORECASE) +LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(".*/(games|archive|archives|rss)/.*", flags=re.IGNORECASE) +LOG_POST_GEMLOGBLUE_LIKE_PATTERN = re.compile("^/users/[a-z][-a-z0-9]*/\d+\.gmi?", flags=re.IGNORECASE) +ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE) +ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE) class GeminiRobotFileParser(RobotFileParser): def set_url(self, url): @@ -52,7 +55,8 @@ class GeminiResource(): self._fetchable_url = None self._indexable_url = None self._is_root_like = None - self._is_log_like = None + self._is_log_root_like = None + self._is_log_post_like = None self.contained_resources = None def urlsplit_featureful(url, fully_qualified_parent_url=None, parent_hostname=None): @@ -166,13 +170,25 @@ class GeminiResource(): return self._is_root_like - def _get_is_log_like(self): - if self._is_log_like is None: - is_log_like = False - if self.urlsplit.path == "" or self.urlsplit.path == "/" or 
LOG_LIKE_PATTERN.match(self.urlsplit.path): - is_log_like = True - self._is_log_like = is_log_like - return self._is_log_like + def _get_is_log_root_like(self): + if self._is_log_root_like is None: + is_log_root_like = False + if self.urlsplit.path == "" or self.urlsplit.path == "/" or LOG_ROOT_LIKE_PATTERN.match(self.urlsplit.path): + is_log_root_like = True + self._is_log_root_like = is_log_root_like + return self._is_log_root_like + + + def _get_is_log_post_like(self): + if self._is_log_post_like is None: + is_log_post_like = False + post_like_match = LOG_POST_LIKE_PATTERN.match(self.urlsplit.path) + post_like_exclusion_match = LOG_POST_LIKE_EXCLUSION_PATTERN.match(self.urlsplit.path) + post_gemlogblue_match = LOG_POST_GEMLOGBLUE_LIKE_PATTERN.match(self.urlsplit.path) + if (post_like_match and not post_like_exclusion_match) or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match): + is_log_post_like = True + self._is_log_post_like = is_log_post_like + return self._is_log_post_like # constructed from fetchable_url @@ -187,7 +203,8 @@ class GeminiResource(): # should be unquoted. indexable_url = property(_get_indexable_url) is_root_like = property(_get_is_root_like) - is_log_like = property(_get_is_log_like) + is_log_root_like = property(_get_is_log_root_like) + is_log_post_like = property(_get_is_log_post_like) normalized_host_like = property(_get_normalized_host_like) def fetch(self): diff --git a/scripts/build_threads.py b/scripts/build_threads.py @@ -0,0 +1,132 @@ +from peewee import JOIN + +from gus import constants +from gus.lib.db_model import init_db, Link, Page, Thread, ThreadPage +from gus.lib.gemini import GeminiResource + + +def find_thread_tops(resource, first_seen, page_id, current_chain=[]): + """ + This function will recursively walk up to the tops of all threads a given + page belongs to, then call recurse_thread on each of them to actually build + the full threads. 
+ """ + u = resource.indexable_url.rstrip("/") + parent_pages_query = Page.raw("""SELECT p_to.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen +FROM page AS p_from +JOIN indexable_crawl AS ic +ON ic.page_id == p_to.id +JOIN crawl AS c +ON c.page_id == p_to.id +JOIN link as l +ON l.from_page_id == p_from.id +JOIN page as p_to +ON p_to.id == l.to_page_id +WHERE p_from.url IN (?, ?) +AND p_to.normalized_url != ? +AND c.status == 20 +GROUP BY p_to.normalized_url +ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_url) + found_threadable_parents = False + for parent_page in parent_pages_query.iterator(): + parent_resource = GeminiResource(parent_page.fetchable_url) + # Skip any parents that are already in the list of seen resources for this call + # stack - it means they're circular linking + if any(r for r in current_chain if r.normalized_url == resource.normalized_url): + continue + if is_threadable_link(resource, parent_resource, parent_page.is_cross_host_like): + found_threadable_parents = True + find_thread_tops(parent_resource, parent_page.first_seen, parent_page.id, current_chain + [resource]) + if not found_threadable_parents: + # return early if thread top already processed + try: + query = ThreadPage.select().join(Page).where(Page.url == resource.indexable_url, ThreadPage.address == "1") + query.get() + print(f"\nAlready done: {resource.fetchable_url}") + return + except ThreadPage.DoesNotExist: + pass + full_thread = recurse_thread(resource, "1", first_seen, page_id) + + # Deduplicate + full_thread.reverse() + i = 0 + while i < len(full_thread): + if any(x for x in full_thread[i+1:] if x[0].normalized_url == full_thread[i][0].normalized_url): + full_thread.pop(i) + else: + i += 1 + full_thread.reverse() + + thread_updated_at = max(m[2] for m in full_thread) + thread = Thread.create(updated_at=thread_updated_at) + print() + for m in full_thread: + ThreadPage.create(thread=thread, page_id=m[3], address=m[1]) + print(" -> 
[{:<11}] [{}] {}".format(m[1], m[2], m[0].fetchable_url)) + + +def recurse_thread(resource, path, first_seen, page_id, current_chain=[]): + if not resource.is_valid or not resource.is_log_post_like: + # if not resource.is_valid: + return [] + u = resource.indexable_url.rstrip("/") + children_query = Page.raw("""SELECT p_from.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen +FROM page AS p_from +JOIN indexable_crawl AS ic +ON ic.page_id == p_from.id +JOIN crawl AS c +ON c.page_id == p_from.id +JOIN link as l +ON l.from_page_id == p_from.id +JOIN page as p_to +ON p_to.id == l.to_page_id +WHERE p_to.url IN (?, ?) +AND p_from.normalized_url != ? +AND c.status == 20 +GROUP BY p_from.normalized_url +ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalized_url) + threadable_child_index = 1 + new_thread_members = [(resource, path, first_seen, page_id)] + for child in children_query.iterator(): + child_resource = GeminiResource(child.fetchable_url) + if is_threadable_link(child_resource, resource, child.is_cross_host_like): + # Skip any parents that are already in the list of seen resources for this call + # stack - it means they're circular linking + if any(r for r in current_chain if r.normalized_url == resource.normalized_url): + continue + child_path = f"{path}.{threadable_child_index}" + new_thread_members.extend(recurse_thread(child_resource, child_path, child.first_seen, child.id, current_chain + [resource])) + threadable_child_index += 1 + return new_thread_members + + +def is_threadable_link(r1, r2, is_cross_host_like): + return r1.is_log_post_like and r2.is_log_post_like and is_cross_host_like + + +def main(): + db = init_db(f"index/{constants.DB_FILENAME}") + Thread.delete().execute() + ThreadPage.delete().execute() + pages_query = Page.raw("""SELECT p.*, MIN(c.timestamp) AS first_seen +FROM page AS p +JOIN indexable_crawl AS ic +ON ic.page_id == p.id +JOIN crawl AS c +ON c.page_id == p.id +LEFT JOIN threadpage AS tp +ON tp.page_id 
== p.id +WHERE tp.page_id IS NULL +AND c.status == 20 +GROUP BY p.normalized_url +""") + for page in pages_query.iterator(): + resource = GeminiResource(page.fetchable_url) + if resource.is_valid and resource.is_log_post_like: + find_thread_tops(resource, page.first_seen, page.id) + print("\nDone!") + + +if __name__ == "__main__": + main() diff --git a/serve/models.py b/serve/models.py @@ -7,7 +7,7 @@ from whoosh import highlight, qparser from whoosh.index import open_dir from . import constants -from gus.lib.db_model import init_db, Crawl, Link, Page, Search +from gus.lib.db_model import init_db, Crawl, Link, Page, Search, Thread from gus.lib.gemini import GeminiResource from gus.lib.index_statistics import compute_index_statistics, load_all_statistics_from_file from gus.lib.misc import bytes2human @@ -69,7 +69,7 @@ class GUS(): return [], [] u = resource.indexable_url.rstrip("/") - backlinks = Page.raw("""SELECT p_from.url, l.is_cross_host_like + backlinks_query = Page.raw("""SELECT p_from.url, l.is_cross_host_like FROM page AS p_from JOIN indexable_crawl AS ic ON ic.page_id == p_from.id @@ -78,14 +78,62 @@ ON l.from_page_id == p_from.id JOIN page as p_to ON p_to.id == l.to_page_id WHERE p_to.url IN (?, ?) +AND p_from.normalized_url != ? 
GROUP BY p_from.normalized_url -ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/") +ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalized_url) + backlinks = backlinks_query.execute() - internal_backlink_urls = [b.url for b in backlinks.execute() if not b.is_cross_host_like] - external_backlink_urls = [b.url for b in backlinks.execute() if b.is_cross_host_like] + internal_backlink_urls = [b.url for b in backlinks if not b.is_cross_host_like] + external_backlink_urls = [b.url for b in backlinks if b.is_cross_host_like] return internal_backlink_urls, external_backlink_urls + def get_threads(self): + threads_query = Thread.raw("""SELECT t.* + , tp.address + , p.fetchable_url + , p.url + , MIN(c.timestamp) AS first_seen +FROM ( + SELECT * + FROM thread + ORDER BY updated_at DESC + LIMIT 20) AS t +JOIN threadpage AS tp +ON tp.thread_id == t.id +JOIN page AS p +ON p.id == tp.page_id +JOIN crawl AS c +ON c.page_id == p.id +WHERE c.status == 20 +GROUP BY tp.id +ORDER BY t.updated_at DESC, t.id ASC, tp.address ASC""") + threads = [] + last_date = None + last_id = None + for thread_member in threads_query.iterator(): + if thread_member.updated_at.date() != last_date: + threads.append({ + "threads": [], + "date": thread_member.updated_at, + }) + last_date = thread_member.updated_at.date() + if thread_member.id != last_id: + threads[-1]["threads"].append({ + "members": [], + "updated_at": thread_member.updated_at, + }) + last_id = thread_member.id + threads[-1]["threads"][-1]["members"].append({ + "url": thread_member.url, + "fetchable_url": thread_member.fetchable_url, + "address": thread_member.address, + "first_seen": datetime.strptime(thread_member.first_seen, "%Y-%m-%d %H:%M:%S.%f"), + }) + # return sorted(threads, key=lambda x: (x["updated_at"], ), reverse=True) + return threads + + def _get_link_text(result): if result["content_type"] == "input": prompt_suffix = ": {}".format(result["prompt"]) diff --git a/serve/templates/about.gmi 
b/serve/templates/about.gmi @@ -65,3 +65,7 @@ The URL structure for retrieving a certain URL's backlinks page is predictable, Note the distinction between "internal" and "cross-capsule" backlinks. Internal backlinks are backlinks from within your own capsule to the given page. Cross-capsule backlinks are backlinks from other users' capsules. {% include 'fragments/footer.gmi' %} + +### Threads (coming soon!) + +Oftentimes in Geminispace a post on someone's gemlog will generate a reply on someone else's gemlog. Sometimes many replies! Sometimes the replies generate their own replies! GUS Threads allow you to visualize and explore these threads within Geminispace. diff --git a/serve/templates/threads.gmi b/serve/templates/threads.gmi @@ -0,0 +1,17 @@ +{% include 'fragments/header.gmi' %} + + +## Threads + +{% for date in threads %} +### {{ date["date"] | datetimeformat("%Y, %b %d") }} + +{% for thread in date["threads"] %} +{% for member in thread["members"] %} +=> {{ member["fetchable_url"] }} [{{ member["first_seen"] | datetimeformat("%b %d") }}] {{member["address"] | threadaddressformat }} {{ member["url"][9:] }} +{% endfor %} +~~~~~~~~~~ +{% endfor %} + +{% endfor %} +{% include 'fragments/footer.gmi' %} diff --git a/serve/views.py b/serve/views.py @@ -20,7 +20,15 @@ template_env = jinja2.Environment( def datetimeformat(value, format="%Y-%m-%d"): return value.strftime(format) + +def threadaddressformat(value): + depth = len(value.split(".")) + if depth > 1: + return " " * (depth - 1) + "↳" + return "" + template_env.filters['datetimeformat'] = datetimeformat +template_env.filters['threadaddressformat'] = threadaddressformat def render_template(name: str, *args, **kwargs) -> str: """ @@ -132,3 +140,12 @@ def backlinks(request): return Response(Status.SUCCESS, "text/gemini", body) else: return Response(Status.INPUT, "Gemini URL") + + +@app.route("/threads") +def threads(request): + threads = gus.get_threads() + body = render_template("threads.gmi", + 
threads=threads, + index_modification_time=gus.statistics["index_modification_time"]) + return Response(Status.SUCCESS, "text/gemini", body)