build_threads.py (7826B)
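"""
Build the Thread and ThreadPage tables from the crawl index.

Starting from every indexed page that looks like a gemlog post, walk the
post's outbound links up to the top of each thread it belongs to, then walk
back down through inbound links to collect the full thread, giving each
member a sortable address like "001.002.001". (This docstring summarizes the
code below.)
"""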
from gus import constants
from gus.lib.db_model import init_db, Page, Thread, ThreadPage
from gus.lib.gemini import GeminiResource

# Pairs of (canonical_prefix, alias_prefix) for gemlogs that are reachable
# under more than one URL prefix. Pages under an alias prefix are collapsed
# onto the canonical prefix so the same post is never threaded twice.
collapsible_log_variations = [
    ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/3albums/"),
    ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/hitenheroes/"),
    ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/cornedbeef/"),
    ("gemini://gemini.circumlunar.space/~", "gemini://gemini.circumlunar.space/users/"),
    ("gemini://cetacean.club", "gemini://maj.kahless.cetacean.club"),
]


def find_thread_tops(resource, first_seen, page_id, content, current_chain=[]):
    """
    Recursively walk up to the tops of all threads a given page belongs to,
    then call recurse_thread on each top to build the full thread.
    """
    # Collapse this page onto its canonical URL prefix, if it has one.
    for collapsible in collapsible_log_variations:
        if resource.normalized_url.startswith(collapsible[1]):
            resource = GeminiResource(collapsible[0] + resource.fetchable_url[len(collapsible[1]):])
            break
    u = resource.indexable_url.rstrip("/")
    # Candidate parents: indexable, successfully crawled text pages that this
    # page links out to, since a reply links back to the post it responds to.
    parent_pages_query = Page.raw("""SELECT p_to.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen
        FROM page AS p_from
        JOIN indexable_crawl AS ic
          ON ic.page_id == p_to.id
        JOIN crawl AS c
          ON c.page_id == p_to.id
        JOIN link AS l
          ON l.from_page_id == p_from.id
        JOIN page AS p_to
          ON p_to.id == l.to_page_id
        WHERE p_from.url IN (?, ?)
          AND p_to.normalized_url != ?
          AND c.status == 20
          AND p_to.content_type LIKE 'text/%'
        GROUP BY p_to.normalized_url
        ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_url)
    found_threadable_parents = False
    for parent_page in parent_pages_query.iterator():
        parent_resource = GeminiResource(parent_page.fetchable_url)
        # Collapse the parent onto its canonical URL prefix as well.
        for collapsible in collapsible_log_variations:
            if parent_resource.normalized_url.startswith(collapsible[1]):
                parent_resource = GeminiResource(collapsible[0] + parent_page.fetchable_url[len(collapsible[1]):])
                break
        # Skip any parents that are already in the list of seen resources for
        # this call stack - it means they're linking circularly.
        if any(r for r in current_chain if r.normalized_url == parent_resource.normalized_url):
            continue
        if is_threadable_link(resource, parent_resource, parent_page.is_cross_host_like):
            found_threadable_parents = True
            find_thread_tops(
                parent_resource,
                parent_page.first_seen,
                parent_page.id,
                parent_page.content,
                current_chain + [resource])
    if not found_threadable_parents:
        # This page is a thread top; return early if it was already processed.
        try:
            query = ThreadPage.select().join(Page).where(Page.url == resource.indexable_url, ThreadPage.address == "001")
            query.get()
            print(f"\nAlready done: {resource.fetchable_url}")
            return
        except ThreadPage.DoesNotExist:
            pass
        full_thread = recurse_thread(resource, "001", first_seen, page_id, content)

        # Deduplicate, keeping only the first occurrence (the one with the
        # lowest thread address) of each page.
        full_thread.reverse()
        i = 0
        while i < len(full_thread):
            if any(x for x in full_thread[i+1:] if x[0].normalized_url == full_thread[i][0].normalized_url):
                full_thread.pop(i)
            else:
                i += 1
        full_thread.reverse()

        # Thread members are (resource, address, first_seen, page_id, content).
        thread_updated_at = max(m[2] for m in full_thread)
        thread = Thread.create(updated_at=thread_updated_at)
        print()
        for m in full_thread:
            ThreadPage.create(
                thread=thread,
                page_id=m[3],
                address=m[1],
                friendly_author=m[0].get_friendly_author(m[4]),
                friendly_title=m[0].get_friendly_title(m[4]),
            )
            print(" -> [{:<19}] [{}] {}".format(m[1], m[2], m[0].fetchable_url))

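# A sketch of the address tree recurse_thread builds, with hypothetical URLs
# (these pages are illustrations only, not entries in the index):
#
#   001          gemini://example.org/gemlog/original-post.gmi
#   001.001      gemini://example.org/gemlog/first-reply.gmi
#   001.001.001  gemini://elsewhere.example/replies/counterpoint.gmi
#   001.002      gemini://example.org/gemlog/second-reply.gmi
#
# Each component is zero-padded to three digits so that addresses sort
# lexicographically into thread order.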
def recurse_thread(resource, path, first_seen, page_id, content, current_chain=[]):
    if not resource.is_valid or not resource.is_log_post_like:
        return []
    u = resource.indexable_url.rstrip("/")
    from_urls = [
        u,
        f"{u}/",
    ]
    # Also match links that point at the other variation of a collapsible URL.
    for collapsible in collapsible_log_variations:
        if resource.normalized_url.startswith(collapsible[1]):
            new_u = collapsible[0] + resource.indexable_url[len(collapsible[1]):]
            from_urls.extend([new_u, f"{new_u}/"])
            break
        elif resource.normalized_url.startswith(collapsible[0]):
            new_u = collapsible[1] + resource.indexable_url[len(collapsible[0]):]
            from_urls.extend([new_u, f"{new_u}/"])
            break
    # Candidate replies: indexable, successfully crawled text pages that link
    # in to this page.
    children_query = Page.raw("""SELECT p_from.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen
        FROM page AS p_from
        JOIN indexable_crawl AS ic
          ON ic.page_id == p_from.id
        JOIN crawl AS c
          ON c.page_id == p_from.id
        JOIN link AS l
          ON l.from_page_id == p_from.id
        JOIN page AS p_to
          ON p_to.id == l.to_page_id
        WHERE p_to.url IN (""" + ", ".join(["?"] * len(from_urls)) + """)
          AND p_from.normalized_url != ?
          AND c.status == 20
          AND p_from.content_type LIKE 'text/%'
        GROUP BY p_from.normalized_url
        ORDER BY l.is_cross_host_like, first_seen ASC""", *from_urls, resource.normalized_url)
    threadable_child_index = 1
    new_thread_members = [(
        resource,
        path,
        first_seen,
        page_id,
        content,
    )]
    processed_collapsed_urls = []
    for child in children_query.iterator():
        collapsed_url = child.fetchable_url
        for collapsible in collapsible_log_variations:
            if child.normalized_url.startswith(collapsible[1]):
                collapsed_url = collapsible[0] + child.fetchable_url[len(collapsible[1]):]
                break
        if collapsed_url in processed_collapsed_urls:
            continue
        processed_collapsed_urls.append(collapsed_url)
        child_resource = GeminiResource(collapsed_url)
        if is_threadable_link(child_resource, resource, child.is_cross_host_like):
            # Skip any children that are already in the list of seen resources
            # for this call stack - it means they're linking circularly.
            if any(r for r in current_chain if r.normalized_url == child_resource.normalized_url):
                continue
            child_path = f"{path:0>3}.{threadable_child_index:03}"
            new_thread_members.extend(recurse_thread(
                child_resource,
                child_path,
                child.first_seen,
                child.id,
                child.content,
                current_chain + [resource]
            ))
            threadable_child_index += 1
    return new_thread_members

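# A link is threadable only when both endpoints look like gemlog posts and the
# crawler flagged the link itself as cross-host-like. For example (hypothetical
# pages), a post on alice.example replying to a post on bob.example could
# thread, while a post linking to its own gemlog's index page could not, since
# an index page is not log-post-like.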
def is_threadable_link(r1, r2, is_cross_host_like):
    return r1.is_log_post_like and r2.is_log_post_like and is_cross_host_like


def main():
    db = init_db(f"index/{constants.DB_FILENAME}")
    # Threads are rebuilt from scratch on every run.
    Thread.delete().execute()
    ThreadPage.delete().execute()
    # Every indexable, successfully crawled text page not yet part of a thread.
    pages_query = Page.raw("""SELECT p.*, MIN(c.timestamp) AS first_seen
        FROM page AS p
        JOIN indexable_crawl AS ic
          ON ic.page_id == p.id
        JOIN crawl AS c
          ON c.page_id == p.id
        LEFT JOIN threadpage AS tp
          ON tp.page_id == p.id
        WHERE tp.page_id IS NULL
          AND c.status == 20
          AND p.content_type LIKE 'text/%'
        GROUP BY p.normalized_url""")
    for page in pages_query.iterator():
        resource = GeminiResource(page.fetchable_url)
        if resource.is_valid and resource.is_log_post_like:
            find_thread_tops(resource, page.first_seen, page.id, page.content)
    print("\nDone!")


if __name__ == "__main__":
    main()
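# Usage sketch (an assumption based on the relative "index/" path and the
# __main__ guard above): run from the directory that contains the index
# database, e.g.
#
#     python build_threads.py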