build_threads.py (7826B)
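"""
Build the Thread and ThreadPage tables from the crawl index.

Starting from every indexed page that looks like a gemlog post, walk the
post's outbound links up to the top of each thread it belongs to, then walk
back down through inbound links to collect the full thread, giving each
member a sortable address like "001.002.001". (This docstring summarizes the
code below.)
"""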
from gus import constants
from gus.lib.db_model import init_db, Page, Thread, ThreadPage
from gus.lib.gemini import GeminiResource

# Pairs of (canonical_prefix, alias_prefix) for gemlogs that are reachable
# under more than one URL prefix. Pages under an alias prefix are collapsed
# onto the canonical prefix so the same post is never threaded twice.
collapsible_log_variations = [
    ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/3albums/"),
    ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/hitenheroes/"),
    ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/cornedbeef/"),
    ("gemini://gemini.circumlunar.space/~", "gemini://gemini.circumlunar.space/users/"),
    ("gemini://cetacean.club", "gemini://maj.kahless.cetacean.club"),
]


def find_thread_tops(resource, first_seen, page_id, content, current_chain=[]):
    """
    Recursively walk up to the tops of all threads a given page belongs to,
    then call recurse_thread on each top to build the full thread.
    """
    # Collapse this page onto its canonical URL prefix, if it has one.
    for collapsible in collapsible_log_variations:
        if resource.normalized_url.startswith(collapsible[1]):
            resource = GeminiResource(collapsible[0] + resource.fetchable_url[len(collapsible[1]):])
            break
    u = resource.indexable_url.rstrip("/")
    # Candidate parents: indexable, successfully crawled text pages that this
    # page links out to, since a reply links back to the post it responds to.
    parent_pages_query = Page.raw("""SELECT p_to.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen
        FROM page AS p_from
        JOIN indexable_crawl AS ic
          ON ic.page_id == p_to.id
        JOIN crawl AS c
          ON c.page_id == p_to.id
        JOIN link AS l
          ON l.from_page_id == p_from.id
        JOIN page AS p_to
          ON p_to.id == l.to_page_id
        WHERE p_from.url IN (?, ?)
          AND p_to.normalized_url != ?
          AND c.status == 20
          AND p_to.content_type LIKE 'text/%'
        GROUP BY p_to.normalized_url
        ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_url)
    found_threadable_parents = False
    for parent_page in parent_pages_query.iterator():
        parent_resource = GeminiResource(parent_page.fetchable_url)
        # Collapse the parent onto its canonical URL prefix as well.
        for collapsible in collapsible_log_variations:
            if parent_resource.normalized_url.startswith(collapsible[1]):
                parent_resource = GeminiResource(collapsible[0] + parent_page.fetchable_url[len(collapsible[1]):])
                break
        # Skip any parents that are already in the list of seen resources for
        # this call stack - it means they're linking circularly.
        if any(r for r in current_chain if r.normalized_url == parent_resource.normalized_url):
            continue
        if is_threadable_link(resource, parent_resource, parent_page.is_cross_host_like):
            found_threadable_parents = True
            find_thread_tops(
                parent_resource,
                parent_page.first_seen,
                parent_page.id,
                parent_page.content,
                current_chain + [resource])
    if not found_threadable_parents:
        # This page is a thread top; return early if it was already processed.
        try:
            query = ThreadPage.select().join(Page).where(Page.url == resource.indexable_url, ThreadPage.address == "001")
            query.get()
            print(f"\nAlready done: {resource.fetchable_url}")
            return
        except ThreadPage.DoesNotExist:
            pass
        full_thread = recurse_thread(resource, "001", first_seen, page_id, content)

        # Deduplicate, keeping only the first occurrence (the one with the
        # lowest thread address) of each page.
        full_thread.reverse()
        i = 0
        while i < len(full_thread):
            if any(x for x in full_thread[i+1:] if x[0].normalized_url == full_thread[i][0].normalized_url):
                full_thread.pop(i)
            else:
                i += 1
        full_thread.reverse()

        # Thread members are (resource, address, first_seen, page_id, content).
        thread_updated_at = max(m[2] for m in full_thread)
        thread = Thread.create(updated_at=thread_updated_at)
        print()
        for m in full_thread:
            ThreadPage.create(
                thread=thread,
                page_id=m[3],
                address=m[1],
                friendly_author=m[0].get_friendly_author(m[4]),
                friendly_title=m[0].get_friendly_title(m[4]),
            )
            print(" -> [{:<19}] [{}] {}".format(m[1], m[2], m[0].fetchable_url))

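# A sketch of the address tree recurse_thread builds, with hypothetical URLs
# (these pages are illustrations only, not entries in the index):
#
#   001          gemini://example.org/gemlog/original-post.gmi
#   001.001      gemini://example.org/gemlog/first-reply.gmi
#   001.001.001  gemini://elsewhere.example/replies/counterpoint.gmi
#   001.002      gemini://example.org/gemlog/second-reply.gmi
#
# Each component is zero-padded to three digits so that addresses sort
# lexicographically into thread order.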
def recurse_thread(resource, path, first_seen, page_id, content, current_chain=[]):
    if not resource.is_valid or not resource.is_log_post_like:
        return []
    u = resource.indexable_url.rstrip("/")
    from_urls = [
        u,
        f"{u}/",
    ]
    # Also match links that point at the other variation of a collapsible URL.
    for collapsible in collapsible_log_variations:
        if resource.normalized_url.startswith(collapsible[1]):
            new_u = collapsible[0] + resource.indexable_url[len(collapsible[1]):]
            from_urls.extend([new_u, f"{new_u}/"])
            break
        elif resource.normalized_url.startswith(collapsible[0]):
            new_u = collapsible[1] + resource.indexable_url[len(collapsible[0]):]
            from_urls.extend([new_u, f"{new_u}/"])
            break
    # Candidate replies: indexable, successfully crawled text pages that link
    # in to this page.
    children_query = Page.raw("""SELECT p_from.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen
        FROM page AS p_from
        JOIN indexable_crawl AS ic
          ON ic.page_id == p_from.id
        JOIN crawl AS c
          ON c.page_id == p_from.id
        JOIN link AS l
          ON l.from_page_id == p_from.id
        JOIN page AS p_to
          ON p_to.id == l.to_page_id
        WHERE p_to.url IN (""" + ", ".join(["?"] * len(from_urls)) + """)
          AND p_from.normalized_url != ?
          AND c.status == 20
          AND p_from.content_type LIKE 'text/%'
        GROUP BY p_from.normalized_url
        ORDER BY l.is_cross_host_like, first_seen ASC""", *from_urls, resource.normalized_url)
    threadable_child_index = 1
    new_thread_members = [(
        resource,
        path,
        first_seen,
        page_id,
        content,
    )]
    processed_collapsed_urls = []
    for child in children_query.iterator():
        collapsed_url = child.fetchable_url
        for collapsible in collapsible_log_variations:
            if child.normalized_url.startswith(collapsible[1]):
                collapsed_url = collapsible[0] + child.fetchable_url[len(collapsible[1]):]
                break
        if collapsed_url in processed_collapsed_urls:
            continue
        processed_collapsed_urls.append(collapsed_url)
        child_resource = GeminiResource(collapsed_url)
        if is_threadable_link(child_resource, resource, child.is_cross_host_like):
            # Skip any children that are already in the list of seen resources
            # for this call stack - it means they're linking circularly.
            if any(r for r in current_chain if r.normalized_url == child_resource.normalized_url):
                continue
            child_path = f"{path:0>3}.{threadable_child_index:03}"
            new_thread_members.extend(recurse_thread(
                child_resource,
                child_path,
                child.first_seen,
                child.id,
                child.content,
                current_chain + [resource]
            ))
            threadable_child_index += 1
    return new_thread_members

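# A link is threadable only when both endpoints look like gemlog posts and the
# crawler flagged the link itself as cross-host-like. For example (hypothetical
# pages), a post on alice.example replying to a post on bob.example could
# thread, while a post linking to its own gemlog's index page could not, since
# an index page is not log-post-like.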
def is_threadable_link(r1, r2, is_cross_host_like):
    return r1.is_log_post_like and r2.is_log_post_like and is_cross_host_like


def main():
    db = init_db(f"index/{constants.DB_FILENAME}")
    # Threads are rebuilt from scratch on every run.
    Thread.delete().execute()
    ThreadPage.delete().execute()
    # Every indexable, successfully crawled text page not yet part of a thread.
    pages_query = Page.raw("""SELECT p.*, MIN(c.timestamp) AS first_seen
        FROM page AS p
        JOIN indexable_crawl AS ic
          ON ic.page_id == p.id
        JOIN crawl AS c
          ON c.page_id == p.id
        LEFT JOIN threadpage AS tp
          ON tp.page_id == p.id
        WHERE tp.page_id IS NULL
          AND c.status == 20
          AND p.content_type LIKE 'text/%'
        GROUP BY p.normalized_url""")
    for page in pages_query.iterator():
        resource = GeminiResource(page.fetchable_url)
        if resource.is_valid and resource.is_log_post_like:
            find_thread_tops(resource, page.first_seen, page.id, page.content)
    print("\nDone!")


if __name__ == "__main__":
    main()
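# Usage sketch (an assumption based on the relative "index/" path and the
# __main__ guard above): run from the directory that contains the index
# database, e.g.
#
#     python build_threads.py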