commit 43397bdda337c7a0f31019358bd56eb0ae87a993
parent 5eebbbfc00555da619054e8129ad70bf3de99fd5
Author: Natalie Pendragon <natpen@natpen.net>
Date: Fri, 6 Nov 2020 08:42:57 -0500
Reformat code with Black
Diffstat:
14 files changed, 685 insertions(+), 404 deletions(-)
diff --git a/gus/__init__.py b/gus/__init__.py
@@ -1 +1 @@
-__version__ = '0.1.0'
+__version__ = "0.1.0"
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -13,7 +13,11 @@ from whoosh.index import open_dir
from gus.crawl import EXCLUDED_URL_PREFIXES
from gus.lib.db_model import init_db, Page
-from gus.lib.index_statistics import compute_index_statistics, persist_statistics, log_index_statistics
+from gus.lib.index_statistics import (
+ compute_index_statistics,
+ persist_statistics,
+ log_index_statistics,
+)
from gus.lib.whoosh_extensions import UrlAnalyzer
import gus.lib.logging
@@ -39,42 +43,16 @@ def create_index(index_dir):
# shutil.rmtree(index_dir, ignore_errors=True)
pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
schema = Schema(
- url_id=ID(
- unique=True,
- ),
- url=TEXT(
- field_boost=2.0,
- stored=True,
- analyzer=UrlAnalyzer(),
- ),
+ url_id=ID(unique=True,),
+ url=TEXT(field_boost=2.0, stored=True, analyzer=UrlAnalyzer(),),
fetchable_url=STORED(),
- domain=TEXT(
- analyzer=UrlAnalyzer(),
- ),
- port=NUMERIC(
- int,
- 32,
- signed=False,
- stored=True,
- ),
- content_type=TEXT(
- stored=True,
- ),
- charset=ID(
- stored=True,
- ),
- lang=ID(
- stored=True,
- ),
- content=TEXT(
- analyzer=FancyAnalyzer(),
- spelling=True,
- stored=True,
- ),
- prompt=TEXT(
- analyzer=FancyAnalyzer(),
- stored=True,
- ),
+ domain=TEXT(analyzer=UrlAnalyzer(),),
+ port=NUMERIC(int, 32, signed=False, stored=True,),
+ content_type=TEXT(stored=True,),
+ charset=ID(stored=True,),
+ lang=ID(stored=True,),
+ content=TEXT(analyzer=FancyAnalyzer(), spelling=True, stored=True,),
+ prompt=TEXT(analyzer=FancyAnalyzer(), stored=True,),
size=NUMERIC(
int,
# this means GUS will have problems indexing responses over ~2GB
@@ -83,14 +61,9 @@ def create_index(index_dir):
stored=True,
),
backlink_count=NUMERIC(
- int,
- 16, # num bits, so max value is 65k
- signed=False,
- stored=True,
- ),
- indexed_at=DATETIME(
- stored=True,
+ int, 16, signed=False, stored=True, # num bits, so max value is 65k
),
+ indexed_at=DATETIME(stored=True,),
)
index_storage.create_index(schema)
@@ -102,16 +75,23 @@ def index_page(page, indexed_urls):
should_skip = True
break
if should_skip:
- logging.debug('URL prefix matches exclusion list, skipping: %s', gus.lib.logging.strip_control_chars(page.url))
+ logging.debug(
+ "URL prefix matches exclusion list, skipping: %s",
+ gus.lib.logging.strip_control_chars(page.url),
+ )
return False
if page.fetchable_url in indexed_urls:
- logging.debug('Page already indexed, skipping: %s', gus.lib.logging.strip_control_chars(page.url))
+ logging.debug(
+ "Page already indexed, skipping: %s",
+ gus.lib.logging.strip_control_chars(page.url),
+ )
return False
logging.info("Indexing page: %s", gus.lib.logging.strip_control_chars(page.url))
u = page.url.rstrip("/")
- external_backlinks = Page.raw("""SELECT p_from.url
+ external_backlinks = Page.raw(
+ """SELECT p_from.url
FROM page AS p_from
JOIN indexable_crawl AS ic
ON ic.page_id == p_from.id
@@ -121,7 +101,10 @@ JOIN page as p_to
ON p_to.id == l.to_page_id
WHERE p_to.url IN (?, ?)
AND l.is_cross_host_like == 1
-GROUP BY p_from.normalized_url""", u, f"{u}/")
+GROUP BY p_from.normalized_url""",
+ u,
+ f"{u}/",
+ )
backlink_urls = [b.url for b in external_backlinks.execute()]
backlink_count = len(backlink_urls)
@@ -146,9 +129,12 @@ GROUP BY p_from.normalized_url""", u, f"{u}/")
index_writer.add_document(**document)
return True
except:
- logging.warn("Failed to index page: %s", gus.lib.logging.strip_control_chars(page.url))
+ logging.warn(
+ "Failed to index page: %s", gus.lib.logging.strip_control_chars(page.url)
+ )
return False
+
def load_indexed_urls(index_dir):
indexed_urls = []
ix = open_dir(index_dir)
@@ -162,7 +148,9 @@ def load_indexed_urls(index_dir):
def invalidate_recent_results(invalidation_window):
recency_minimum = datetime.now() - timedelta(hours=invalidation_window)
- pages = Page.select().where(Page.indexed_at.is_null(False), Page.indexed_at > recency_minimum)
+ pages = Page.select().where(
+ Page.indexed_at.is_null(False), Page.indexed_at > recency_minimum
+ )
for page in pages:
index_writer.delete_by_term("url_id", page.url, searcher=None)
@@ -183,13 +171,17 @@ def build_index(should_run_destructive=False, invalidation_window=0):
index_writer = ix.writer()
invalidate_recent_results(invalidation_window)
- indexed_urls = [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT)
+ indexed_urls = (
+ [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT)
+ )
- pages = Page.raw("""SELECT p.*, MAX(c.timestamp) AS crawl_timestamp
+ pages = Page.raw(
+ """SELECT p.*, MAX(c.timestamp) AS crawl_timestamp
FROM indexable_crawl AS c
JOIN page AS p
ON p.id == c.page_id
-GROUP BY p.normalized_url""")
+GROUP BY p.normalized_url"""
+ )
i = 0
for page in pages.iterator():
@@ -202,10 +194,10 @@ GROUP BY p.normalized_url""")
# it to flush segments to disk every 5000 documents, which
# should scale well with Geminispace going forward.
if i % 5000 == 0:
- logging.debug('Committing index.')
+ logging.debug("Committing index.")
index_writer.commit()
index_writer = ix.writer()
- logging.debug('Committing index for the last time.')
+ logging.debug("Committing index for the last time.")
index_writer.commit()
index_statistics = compute_index_statistics(db)
@@ -216,7 +208,7 @@ GROUP BY p.normalized_url""")
# shutil.rmtree(INDEX_DIR_CURRENT, ignore_errors=True)
# shutil.move(INDEX_DIR_NEW, INDEX_DIR_CURRENT)
- logging.info('Finished!')
+ logging.info("Finished!")
def main():
@@ -226,7 +218,7 @@ def main():
def parse_args():
- parser = argparse.ArgumentParser(description='Crawl Geminispace.')
+ parser = argparse.ArgumentParser(description="Crawl Geminispace.")
parser.add_argument(
"--destructive",
"-d",
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -155,7 +155,10 @@ CRAWL_DELAYS = {
def index_binary(resource, response):
- logging.debug('Indexing binary for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))
+ logging.debug(
+ "Indexing binary for: %s",
+ gus.lib.logging.strip_control_chars(resource.indexable_url),
+ )
doc = {
"url": resource.indexable_url,
@@ -171,15 +174,23 @@ def index_binary(resource, response):
existing_page = Page.get_or_none(url=resource.indexable_url)
if existing_page:
doc["id"] = existing_page.id
- existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("binary")
- doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "binary")
+ existing_change_frequency = (
+ existing_page.change_frequency
+ or resource.get_default_change_frequency("binary")
+ )
+ doc["change_frequency"] = resource.increment_change_frequency(
+ existing_change_frequency, "binary"
+ )
page = Page(**doc)
page.save()
return page
def index_redirect(resource):
- logging.debug('Indexing redirect for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))
+ logging.debug(
+ "Indexing redirect for: %s",
+ gus.lib.logging.strip_control_chars(resource.indexable_url),
+ )
doc = {
"url": resource.indexable_url,
@@ -192,15 +203,23 @@ def index_redirect(resource):
existing_page = Page.get_or_none(url=resource.indexable_url)
if existing_page:
doc["id"] = existing_page.id
- existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("redirect")
- doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "redirect")
+ existing_change_frequency = (
+ existing_page.change_frequency
+ or resource.get_default_change_frequency("redirect")
+ )
+ doc["change_frequency"] = resource.increment_change_frequency(
+ existing_change_frequency, "redirect"
+ )
page = Page(**doc)
page.save()
return page
def index_error(resource, is_temporary):
- logging.debug('Indexing error for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))
+ logging.debug(
+ "Indexing error for: %s",
+ gus.lib.logging.strip_control_chars(resource.indexable_url),
+ )
category = "temp_error" if is_temporary else "perm_error"
default_change_frequency = resource.get_default_change_frequency(category)
@@ -215,16 +234,22 @@ def index_error(resource, is_temporary):
existing_page = Page.get_or_none(url=resource.indexable_url)
if existing_page:
doc["id"] = existing_page.id
- existing_change_frequency = existing_page.change_frequency or default_change_frequency
- doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, category)
+ existing_change_frequency = (
+ existing_page.change_frequency or default_change_frequency
+ )
+ doc["change_frequency"] = resource.increment_change_frequency(
+ existing_change_frequency, category
+ )
page = Page(**doc)
page.save()
return page
-
def index_prompt(resource, response):
- logging.debug('Indexing prompt for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))
+ logging.debug(
+ "Indexing prompt for: %s",
+ gus.lib.logging.strip_control_chars(resource.indexable_url),
+ )
doc = {
"url": resource.indexable_url,
@@ -241,15 +266,23 @@ def index_prompt(resource, response):
existing_page = Page.get_or_none(url=resource.indexable_url)
if existing_page:
doc["id"] = existing_page.id
- existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("prompt")
- doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "prompt")
+ existing_change_frequency = (
+ existing_page.change_frequency
+ or resource.get_default_change_frequency("prompt")
+ )
+ doc["change_frequency"] = resource.increment_change_frequency(
+ existing_change_frequency, "prompt"
+ )
page = Page(**doc)
page.save()
return page
def index_content(resource, response):
- logging.debug('Indexing content for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))
+ logging.debug(
+ "Indexing content for: %s",
+ gus.lib.logging.strip_control_chars(resource.indexable_url),
+ )
doc = {
"url": resource.indexable_url,
@@ -264,7 +297,7 @@ def index_content(resource, response):
"change_frequency": resource.get_default_change_frequency("content"),
}
if response.content_type == "text/gemini":
- doc["lang"] = response.lang or "none",
+ doc["lang"] = (response.lang or "none",)
existing_page = Page.get_or_none(url=resource.indexable_url)
is_different = False
if existing_page:
@@ -272,10 +305,17 @@ def index_content(resource, response):
if existing_page.content:
is_different = doc["content"] != existing_page.content
if is_different:
- doc["change_frequency"] = resource.get_default_change_frequency("content")
+ doc["change_frequency"] = resource.get_default_change_frequency(
+ "content"
+ )
else:
- existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("content")
- doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "content")
+ existing_change_frequency = (
+ existing_page.change_frequency
+ or resource.get_default_change_frequency("content")
+ )
+ doc["change_frequency"] = resource.increment_change_frequency(
+ existing_change_frequency, "content"
+ )
page = Page(**doc)
page.save()
return page, is_different
@@ -301,17 +341,21 @@ def index_links(from_resource, contained_resources):
domain=cr.normalized_host,
port=cr.urlsplit.port or 1965,
)
- data.append({
- "from_page": from_page,
- "to_page": to_page,
- "is_cross_host_like": Link.get_is_cross_host_like(from_resource, cr),
- })
+ data.append(
+ {
+ "from_page": from_page,
+ "to_page": to_page,
+ "is_cross_host_like": Link.get_is_cross_host_like(from_resource, cr),
+ }
+ )
Link.insert_many(data).execute()
def fetch_robots_file(robot_host):
robot_url = urljoin("gemini://{}".format(robot_host), "/robots.txt")
- logging.info('Fetching robots file: %s', gus.lib.logging.strip_control_chars(robot_url))
+ logging.info(
+ "Fetching robots file: %s", gus.lib.logging.strip_control_chars(robot_url)
+ )
rp = GeminiRobotFileParser(robot_url)
rp.read()
@@ -322,30 +366,52 @@ def get_robots_file(robot_host):
return robot_file_map[robot_host]
-def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, redirect_chain=[]):
+def crawl_page(
+ gemini_resource, current_depth, should_check_if_expired=True, redirect_chain=[]
+):
gr = gemini_resource
url = gr.fetchable_url
if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
- logging.warn('Going too deep, skipping: %s', gus.lib.logging.strip_control_chars(url))
+ logging.warn(
+ "Going too deep, skipping: %s", gus.lib.logging.strip_control_chars(url)
+ )
return
if not gemini_resource.is_valid:
- logging.warn('Not a valid gemini resource, skipping: %s', gus.lib.logging.strip_control_chars(url))
+ logging.warn(
+ "Not a valid gemini resource, skipping: %s",
+ gus.lib.logging.strip_control_chars(url),
+ )
return
for excluded_prefix in EXCLUDED_URL_PREFIXES:
if gr.normalized_url.startswith(excluded_prefix):
- logging.info('URL prefix matches exclusion list, skipping: %s', gus.lib.logging.strip_control_chars(url))
+ logging.info(
+ "URL prefix matches exclusion list, skipping: %s",
+ gus.lib.logging.strip_control_chars(url),
+ )
return
for excluded_path in EXCLUDED_URL_PATHS:
if gr.urlsplit.path.lower().endswith(excluded_path):
- logging.info('URL on exclusion list, skipping: %s', gus.lib.logging.strip_control_chars(url))
+ logging.info(
+ "URL on exclusion list, skipping: %s",
+ gus.lib.logging.strip_control_chars(url),
+ )
return
if should_check_if_expired:
existing_page = Page.get_or_none(url=gr.indexable_url)
if existing_page and existing_page.change_frequency is not None:
- most_recent_crawl = Crawl.select(peewee.fn.MAX(Crawl.timestamp)).where(Crawl.page == existing_page).scalar()
- if most_recent_crawl and datetime.now() < most_recent_crawl + timedelta(hours=existing_page.change_frequency):
- logging.debug('Recrawling too soon, skipping: %s', gus.lib.logging.strip_control_chars(gr.fetchable_url))
+ most_recent_crawl = (
+ Crawl.select(peewee.fn.MAX(Crawl.timestamp))
+ .where(Crawl.page == existing_page)
+ .scalar()
+ )
+ if most_recent_crawl and datetime.now() < most_recent_crawl + timedelta(
+ hours=existing_page.change_frequency
+ ):
+ logging.debug(
+ "Recrawling too soon, skipping: %s",
+ gus.lib.logging.strip_control_chars(gr.fetchable_url),
+ )
return
# ROBOTS
@@ -365,120 +431,188 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
crawl_delay = robots_file.crawl_delay("gus")
if not can_fetch:
- logging.debug('Blocked by robots files, skipping: %s', gus.lib.logging.strip_control_chars(url))
+ logging.debug(
+ "Blocked by robots files, skipping: %s",
+ gus.lib.logging.strip_control_chars(url),
+ )
return
# Crawl delay
if gr.normalized_host in domain_hit_timings:
if gr.normalized_host in CRAWL_DELAYS:
- next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=CRAWL_DELAYS[gr.normalized_host])
+ next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(
+ milliseconds=CRAWL_DELAYS[gr.normalized_host]
+ )
elif not crawl_delay:
- next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=500)
+ next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(
+ milliseconds=500
+ )
else:
- next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=crawl_delay)
+ next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(
+ milliseconds=crawl_delay
+ )
sleep_duration = max((next_allowed_hit - datetime.now()).total_seconds(), 0)
time.sleep(sleep_duration)
domain_hit_timings[gr.normalized_host] = datetime.now()
# Actually fetch!
- logging.info('Fetching resource: %s', gus.lib.logging.strip_control_chars(url))
+ logging.info("Fetching resource: %s", gus.lib.logging.strip_control_chars(url))
if gr.fully_qualified_parent_url is not None:
- logging.debug('with parent: %s', gus.lib.logging.strip_control_chars(gr.fully_qualified_parent_url))
+ logging.debug(
+ "with parent: %s",
+ gus.lib.logging.strip_control_chars(gr.fully_qualified_parent_url),
+ )
response = gr.fetch()
if response is None:
# problem before getting a response
- logging.warn('Failed to fetch: %s', gus.lib.logging.strip_control_chars(url))
+ logging.warn("Failed to fetch: %s", gus.lib.logging.strip_control_chars(url))
page = index_error(gr, True)
- page_crawl = Crawl(page=page,
- status=0,
- is_different=False,
- timestamp=datetime.utcnow())
+ page_crawl = Crawl(
+ page=page, status=0, is_different=False, timestamp=datetime.utcnow()
+ )
page_crawl.save()
elif response.status.startswith("4"):
# temporary error status
- logging.debug('Got temporary error: %s: %s %s',
- gus.lib.logging.strip_control_chars(url),
- response.status,
- response.error_message)
+ logging.debug(
+ "Got temporary error: %s: %s %s",
+ gus.lib.logging.strip_control_chars(url),
+ response.status,
+ response.error_message,
+ )
page = index_error(gr, True)
- page_crawl = Crawl(page=page,
- status=response.status,
- is_different=False,
- error_message=response.error_message,
- timestamp=datetime.utcnow())
+ page_crawl = Crawl(
+ page=page,
+ status=response.status,
+ is_different=False,
+ error_message=response.error_message,
+ timestamp=datetime.utcnow(),
+ )
page_crawl.save()
elif response.status.startswith("5"):
# permanent error status
- logging.debug('Got permanent error: %s: %s %s',
- gus.lib.logging.strip_control_chars(url),
- response.status,
- response.error_message)
+ logging.debug(
+ "Got permanent error: %s: %s %s",
+ gus.lib.logging.strip_control_chars(url),
+ response.status,
+ response.error_message,
+ )
page = index_error(gr, False)
- page_crawl = Crawl(page=page,
- status=response.status,
- is_different=False,
- error_message=response.error_message,
- timestamp=datetime.utcnow())
+ page_crawl = Crawl(
+ page=page,
+ status=response.status,
+ is_different=False,
+ error_message=response.error_message,
+ timestamp=datetime.utcnow(),
+ )
page_crawl.save()
elif response.status.startswith("3"):
# redirect status
- logging.debug('Got redirected: %s: %s %s',
- gus.lib.logging.strip_control_chars(url),
- response.status,
- response.url)
+ logging.debug(
+ "Got redirected: %s: %s %s",
+ gus.lib.logging.strip_control_chars(url),
+ response.status,
+ response.url,
+ )
if len(redirect_chain) > constants.MAXIMUM_REDIRECT_CHAIN_LENGTH:
- logging.info('Aborting, maximum redirect chain length reached: %s', gus.lib.logging.strip_control_chars(url))
+ logging.info(
+ "Aborting, maximum redirect chain length reached: %s",
+ gus.lib.logging.strip_control_chars(url),
+ )
return
- redirect_resource = GeminiResource(response.url, gr.normalized_url, gr.normalized_host)
+ redirect_resource = GeminiResource(
+ response.url, gr.normalized_url, gr.normalized_host
+ )
if redirect_resource.fetchable_url == gr.fetchable_url:
- logging.info('Aborting, redirecting to self: %s', gus.lib.logging.strip_control_chars(url))
+ logging.info(
+ "Aborting, redirecting to self: %s",
+ gus.lib.logging.strip_control_chars(url),
+ )
return
page = index_redirect(gr)
- page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
+ page_crawl = Crawl(
+ page=page,
+ status=response.status,
+ is_different=False,
+ timestamp=datetime.utcnow(),
+ )
page_crawl.save()
index_links(gr, [redirect_resource])
- crawl_page(redirect_resource, current_depth, should_check_if_expired=True, redirect_chain=redirect_chain + [gr.fetchable_url])
+ crawl_page(
+ redirect_resource,
+ current_depth,
+ should_check_if_expired=True,
+ redirect_chain=redirect_chain + [gr.fetchable_url],
+ )
elif response.status.startswith("1"):
# input status
- logging.debug('Input requested at: %s: %s %s', gus.lib.logging.strip_control_chars(url), response.status, response.prompt)
+ logging.debug(
+ "Input requested at: %s: %s %s",
+ gus.lib.logging.strip_control_chars(url),
+ response.status,
+ response.prompt,
+ )
page = index_prompt(gr, response)
- page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
+ page_crawl = Crawl(
+ page=page,
+ status=response.status,
+ is_different=False,
+ timestamp=datetime.utcnow(),
+ )
page_crawl.save()
elif response.status.startswith("2"):
# success status
- logging.debug('Successful request: %s: %s %s', gus.lib.logging.strip_control_chars(url), response.status, response.content_type)
+ logging.debug(
+ "Successful request: %s: %s %s",
+ gus.lib.logging.strip_control_chars(url),
+ response.status,
+ response.content_type,
+ )
if response.content_type.startswith("text/"):
page, is_different = index_content(gr, response)
page_crawl = Crawl(
page=page,
status=response.status,
is_different=is_different,
- timestamp=datetime.utcnow()
+ timestamp=datetime.utcnow(),
)
page_crawl.save()
if response.content_type != "text/gemini":
- logging.debug('Content is not gemini text: %s: %s',
- gus.lib.logging.strip_control_chars(url), response.content_type)
+ logging.debug(
+ "Content is not gemini text: %s: %s",
+ gus.lib.logging.strip_control_chars(url),
+ response.content_type,
+ )
else:
- logging.debug('Got gemini text, extracting and crawling links: %s',
- gus.lib.logging.strip_control_chars(url))
+ logging.debug(
+ "Got gemini text, extracting and crawling links: %s",
+ gus.lib.logging.strip_control_chars(url),
+ )
contained_resources = gr.extract_contained_resources(response.content)
index_links(gr, contained_resources)
for resource in contained_resources:
- crawl_page(resource, current_depth+1, should_check_if_expired=True)
+ crawl_page(
+ resource, current_depth + 1, should_check_if_expired=True
+ )
else:
page = index_binary(gr, response)
- page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
+ page_crawl = Crawl(
+ page=page,
+ status=response.status,
+ is_different=False,
+ timestamp=datetime.utcnow(),
+ )
page_crawl.save()
else:
- logging.warn('Got unhandled status: %s: %s',
- gus.lib.logging.strip_control_chars(url),
- response.status)
+ logging.warn(
+ "Got unhandled status: %s: %s",
+ gus.lib.logging.strip_control_chars(url),
+ response.status,
+ )
def pickle_robot_file_map(robot_file_map, index_dir):
@@ -487,13 +621,14 @@ def pickle_robot_file_map(robot_file_map, index_dir):
def unpickle_robot_file_map(index_dir):
if not os.path.isfile(index_dir + "/robot_file_map.p"):
- logging.debug('Robot file cache missing')
+ logging.debug("Robot file cache missing")
return {}
return pickle.load(open(index_dir + "/robot_file_map.p", "rb"))
def load_expired_urls():
- expired_pages = Page.raw("""SELECT url
+ expired_pages = Page.raw(
+ """SELECT url
FROM (
SELECT p.url, p.normalized_url, p.change_frequency, MAX(c.timestamp) as timestamp
FROM page as p
@@ -502,7 +637,8 @@ FROM (
GROUP BY p.url
)
WHERE datetime(timestamp, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now')
-GROUP BY normalized_url;""")
+GROUP BY normalized_url;"""
+ )
return [page.url for page in expired_pages.execute()]
@@ -528,7 +664,10 @@ def load_feed_urls(filename):
def items_from_feed_string(feed_str):
feed_obj = feedparser.parse(feed_str)
feed = feed_obj.feed
- return [(entry.updated_parsed, entry.link, entry.title, feed.title) for entry in feed_obj.entries]
+ return [
+ (entry.updated_parsed, entry.link, entry.title, feed.title)
+ for entry in feed_obj.entries
+ ]
def resolve_feed_content_urls(feed_file=constants.FEED_FILE):
@@ -550,26 +689,29 @@ def resolve_feed_content_urls(feed_file=constants.FEED_FILE):
now = time.time()
interval = int(now - last)
if interval < 5:
- logging.warn('Declining to hit %s again after only %d seconds',
- gus.lib.logging.strip_control_chars(feed_resource.normalized_host),
- interval)
+ logging.warn(
+ "Declining to hit %s again after only %d seconds",
+ gus.lib.logging.strip_control_chars(feed_resource.normalized_host),
+ interval,
+ )
feed_urls.insert(0, feed_url)
skips += 1
if skips == len(feed_urls):
# We've hammered every server in the queue! Sleep a bit...
- logging.warn('Sleeping to give all servers a rest!')
+ logging.warn("Sleeping to give all servers a rest!")
time.sleep(5)
continue
skips = 0
# Good to go
- logging.info('Fetching feed: %s',
- gus.lib.logging.strip_control_chars(feed_url))
+ logging.info("Fetching feed: %s", gus.lib.logging.strip_control_chars(feed_url))
try:
resp = feed_resource.fetch()
except:
- logging.info('Error fetching feed, skipping: %s',
- gus.lib.logging.strip_control_chars(feed_url))
+ logging.info(
+ "Error fetching feed, skipping: %s",
+ gus.lib.logging.strip_control_chars(feed_url),
+ )
continue
if resp and resp.status == "20":
last_accessed[feed_resource.normalized_host] = time.time()
@@ -595,8 +737,10 @@ def recrawl_feeds():
crawl_page(resource, 0)
pickle_robot_file_map(robot_file_map, index_dir)
- logging.debug('Recrawled feeds: %s', gus.lib.logging.strip_control_chars(content_urls))
- logging.info('Finished!')
+ logging.debug(
+ "Recrawled feeds: %s", gus.lib.logging.strip_control_chars(content_urls)
+ )
+ logging.info("Finished!")
def run_crawl(should_run_destructive=False, seed_urls=[]):
@@ -609,7 +753,9 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
global robot_file_map
- robot_file_map = {} if should_run_destructive else unpickle_robot_file_map(INDEX_DIR_CURRENT)
+ robot_file_map = (
+ {} if should_run_destructive else unpickle_robot_file_map(INDEX_DIR_CURRENT)
+ )
global domain_hit_timings
domain_hit_timings = {}
global max_crawl_depth
@@ -623,7 +769,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
crawl_page(resource, 0, should_check_if_expired=True)
pickle_robot_file_map(robot_file_map, index_dir)
- logging.info('Finished!')
+ logging.info("Finished!")
def main():
@@ -637,7 +783,7 @@ def main():
def parse_args():
- parser = argparse.ArgumentParser(description='Crawl Geminispace.')
+ parser = argparse.ArgumentParser(description="Crawl Geminispace.")
parser.add_argument(
"--destructive",
"-d",
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -12,6 +12,7 @@ from peewee import (
from gus.lib.gemini import GeminiResource
+
def init_db(filename=":memory:"):
"""
Bind an SQLite database to the Peewee ORM models.
@@ -20,13 +21,15 @@ def init_db(filename=":memory:"):
db = SqliteDatabase(filename)
db.bind(models)
db.create_tables(models)
- db.execute_sql("""CREATE VIEW IF NOT EXISTS indexable_crawl AS
+ db.execute_sql(
+ """CREATE VIEW IF NOT EXISTS indexable_crawl AS
SELECT c.* FROM (
SELECT crawl.*, row_number()
OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS rank
FROM crawl) AS c
WHERE c.rank < 3
-AND c.status == 20;""")
+AND c.status == 20;"""
+ )
return db
@@ -46,33 +49,36 @@ class Page(Model):
lang = TextField(null=True)
content = TextField(null=True)
prompt = TextField(null=True)
- size = IntegerField(null=True) # in bytes
- change_frequency = IntegerField(null=True) # in hours
+ size = IntegerField(null=True) # in bytes
+ change_frequency = IntegerField(null=True) # in hours
indexed_at = DateTimeField(null=True)
+
class Link(Model):
"""
Hyperlinks between pages in Geminispace
"""
- from_page = ForeignKeyField(Page, backref="outbound_links", on_delete='CASCADE')
- to_page = ForeignKeyField(Page, backref="backlinks", on_delete='CASCADE')
+ from_page = ForeignKeyField(Page, backref="outbound_links", on_delete="CASCADE")
+ to_page = ForeignKeyField(Page, backref="backlinks", on_delete="CASCADE")
is_cross_host_like = BooleanField()
def get_is_cross_host_like(from_resource, to_resource):
return from_resource.normalized_host_like != to_resource.normalized_host_like
+
class Crawl(Model):
"""
Attempts to crawl a page.
"""
- page = ForeignKeyField(Page, backref="crawls", on_delete='CASCADE')
+ page = ForeignKeyField(Page, backref="crawls", on_delete="CASCADE")
status = IntegerField()
error_message = TextField(null=True)
is_different = BooleanField()
timestamp = DateTimeField()
+
class Search(Model):
"""
A log of performed searches
@@ -81,19 +87,22 @@ class Search(Model):
query = TextField()
timestamp = DateTimeField()
+
class Thread(Model):
"""
Thread definitions.
"""
+
updated_at = DateTimeField()
+
class ThreadPage(Model):
"""
Mapping table of threads to their member pages.
"""
thread = ForeignKeyField(Thread, backref="pages", on_delete="CASCADE")
- page = ForeignKeyField(Page, backref="threads", on_delete='CASCADE')
+ page = ForeignKeyField(Page, backref="threads", on_delete="CASCADE")
address = TextField()
friendly_author = TextField()
friendly_title = TextField()
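
The db_model.py changes are pure PEP 8 housekeeping as Black enforces it: exactly two blank lines between top-level classes, at least two spaces before an inline comment (hence the extra space in size = IntegerField(null=True)  # in bytes), plus the same quote normalization seen in __init__.py. In miniature:

    x = 1 # pre-Black: one space before the hash
    y = 1  # Black: two spaces before an inline comment


    def top_level():  # with two blank lines above any top-level def
        pass
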
diff --git a/gus/lib/domain.py b/gus/lib/domain.py
@@ -1,5 +1,6 @@
import re
+
def is_domain(possible_domain):
domain_pattern = "^((?=[a-z0-9-]{1,63}\.)(xn--)?[a-z0-9]+(-[a-z0-9]+)*\.)+(aaa|aarp|abarth|abb|abbott|abbvie|abc|able|abogado|abudhabi|ac|academy|accenture|accountant|accountants|aco|active|ac|or|ad|adac|ads|adult|ae|aeg|aero|aetna|af|afamilycompany|afl|africa|ag|agakhan|agency|ai|aig|aigo|airbus|airforce|airtel|akdn|al|alfaromeo|alibaba|alipay|allfinanz|allstate|ally|alsace|alstom|am|americanexpress|americanfamily|amex|amfam|amica|amsterdam|an|analytics|android|anquan|anz|ao|aol|apartments|app|apple|aq|aquarelle|ar|arab|aramco|archi|army|arpa|art|arte|as|asda|asia|associates|at|athleta|attorney|au|auction|audi|audible|audio|auspost|author|auto|autos|avianca|aw|aws|ax|axa|az|azure|ba|baby|baidu|banamex|bananarepublic|band|bank|bar|barcelona|barclaycard|barclays|barefoot|bargains|baseball|basketball|bauhaus|bayern|bb|bbc|bbt|bbva|bcg|bcn|bd|be|beats|beauty|beer|bentley|berlin|best|bestbuy|bet|bf|bg|bh|bharti|bi|bible|bid|bike|bing|bingo|bio|biz|bj|bl|black|blackfriday|blanco|blockbuster|blog|bloomberg|blue|bm|bms|bmw|bn|bnl|bnpparibas|bo|boats|boehringer|bofa|bom|bond|boo|book|booking|boots|bosch|bostik|boston|bot|boutique|box|bq|br|bradesco|bridgestone|broadway|broker|brother|brussels|bs|bt|budapest|bugatti|build|builders|business|buy|buzz|bv|bw|by|bz|bzh|ca|cab|cafe|cal|call|calvinklein|cam|camera|camp|cancerresearch|canon|capetown|capital|capitalone|car|caravan|cards|care|career|careers|cars|cartier|casa|case|caseih|cash|casino|cat|catering|catholic|cba|cbn|cbre|cbs|cc|cd|ceb|center|ceo|cern|cf|cfa|cfd|cg|ch|chanel|channel|charity|chase|chat|cheap|chintai|chloe|christmas|chrome|chrysler|church|ci|cipriani|circle|cisco|citadel|citi|citic|city|cityeats|ck|cl|claims|cleaning|click|clinic|clinique|clothing|cloud|club|clubmed|cm|cn|co|coach|codes|coffee|college|cologne|com|comcast|commbank|community|company|compare|computer|comsec|condos|construction|consulting|contact|contractors|cooking|cookingchannel|cool|coop|corsica|country|coupon|coupons|courses|cr|credit|creditcard|creditunion|cricket|crown|crs|cruise|cruises|csc|cu|cuisinella|cv|cw|cx|cy|cymru|cyou|cz|dabur|dad|dance|data|date|dating|datsun|day|dclk|dds|de|deal|dealer|deals|degree|delivery|dell|deloitte|delta|democrat|dental|dentist|desi|design|dev|dhl|diamonds|diet|digital|direct|directory|discount|discover|dish|diy|dj|dk|dm|dnp|do|docs|doctor|dodge|dog|doha|domains|doosan|dot|download|drive|dtv|dubai|duck|dunlop|duns|dupont|durban|dvag|dvr|dz|earth|eat|ec|eco|edeka|edu|education|ee|eg|eh|email|emerck|energy|engineer|engineering|enterprises|epost|epson|equipment|er|ericsson|erni|es|esq|estate|esurance|et|etisalat|eu|eurovision|eus|events|everbank|exchange|expert|exposed|express|extraspace|fage|fail|fairwinds|faith|family|fan|fans|farm|farmers|fashion|fast|fedex|feedback|ferrari|ferrero|fi|fiat|fidelity|fido|film|final|finance|financial|fire|firestone|firmdale|fish|fishing|fit|fitness|fj|fk|flickr|flights|flir|florist|flowers|flsmidth|fly|fm|fo|foo|food|foodnetwork|football|ford|forex|forsale|forum|foundation|fox|fr|free|fresenius|frl|frogans|frontdoor|frontier|ftr|fujitsu|fujixerox|fun|fund|furniture|futbol|fyi|ga|gal|gallery|gallo|gallup|game|games|gap|garden|gb|gbiz|gd|gdn|ge|gea|gent|genting|george|gf|gg|ggee|gh|gi|gift|gifts|gives|giving|gl|glade|glass|gle|global|globo|gm|gmail|gmbh|gmo|gmx|gn|godaddy|gold|goldpoint|golf|goo|goodhands|goodyear|goog|google|gop|got|gov|gp|gq|gr|grainger|graphics|gratis|green|gripe|grocery|group|gs|gt|gu|guardian|gucci|guge|guide|guitars|guru|gw|gy|hair|hamburg|hangout|haus|hbo|hdfc|hdfcbank|health|h
ealthcare|help|helsinki|here|hermes|hgtv|hiphop|hisamitsu|hitachi|hiv|hk|hkt|hm|hn|hockey|holdings|holiday|homedepot|homegoods|homes|homesense|honda|honeywell|horse|hospital|host|hosting|hot|hoteles|hotels|hotmail|house|how|hr|hsbc|ht|htc|hu|hughes|hyatt|hyundai|ibm|icbc|ice|icu|id|ie|ieee|ifm|iinet|ikano|il|im|imamat|imdb|immo|immobilien|in|industries|infiniti|info|ing|ink|institute|insurance|insure|int|intel|international|intuit|investments|io|ipiranga|iq|ir|irish|is|iselect|ismaili|ist|istanbul|it|itau|itv|iveco|iwc|jaguar|java|jcb|jcp|je|jeep|jetzt|jewelry|jio|jlc|jll|jm|jmp|jnj|jo|jobs|joburg|jot|joy|jp|jpmorgan|jprs|juegos|juniper|kaufen|kddi|ke|kerryhotels|kerrylogistics|kerryproperties|kfh|kg|kh|ki|kia|kim|kinder|kindle|kitchen|kiwi|km|kn|koeln|komatsu|kosher|kp|kpmg|kpn|kr|krd|kred|kuokgroup|kw|ky|kyoto|kz|la|lacaixa|ladbrokes|lamborghini|lamer|lancaster|lancia|lancome|land|landrover|lanxess|lasalle|lat|latino|latrobe|law|lawyer|lb|lc|lds|lease|leclerc|lefrak|legal|lego|lexus|lgbt|li|liaison|lidl|life|lifeinsurance|lifestyle|lighting|like|lilly|limited|limo|lincoln|linde|link|lipsy|live|living|lixil|lk|llc|loan|loans|locker|locus|loft|lol|london|lotte|lotto|love|lpl|lplfinancial|lr|ls|lt|ltd|ltda|lu|lundbeck|lupin|luxe|luxury|lv|ly|ma|macys|madrid|maif|maison|makeup|man|management|mango|map|market|marketing|markets|marriott|marshalls|maserati|mattel|mba|mc|mcd|mcdonalds|mckinsey|md|me|med|media|meet|melbourne|meme|memorial|men|menu|meo|merckmsd|metlife|mf|mg|mh|miami|microsoft|mil|mini|mint|mit|mitsubishi|mk|ml|mlb|mls|mm|mma|mn|mo|mobi|mobile|mobily|moda|moe|moi|mom|monash|money|monster|montblanc|mopar|mormon|mortgage|moscow|moto|motorcycles|mov|movie|movistar|mp|mq|mr|ms|msd|mt|mtn|mtpc|mtr|mu|museum|mutual|mutuelle|mv|mw|mx|my|mz|na|nab|nadex|nagoya|name|nationwide|natura|navy|nba|nc|ne|nec|net|netbank|netflix|network|neustar|new|newholland|news|next|nextdirect|nexus|nf|nfl|ng|ngo|nhk|ni|nico|nike|nikon|ninja|nissan|nissay|nl|no|nokia|northwesternmutual|norton|now|nowruz|nowtv|np|nr|nra|nrw|ntt|nu|nyc|nz|obi|observer|off|office|okinawa|olayan|olayangroup|oldnavy|ollo|om|omega|one|ong|onl|online|onyourside|ooo|open|oracle|orange|org|organic|orientexpress|origins|osaka|otsuka|ott|ovh|pa|page|pamperedchef|panasonic|panerai|paris|pars|partners|parts|party|passagens|pay|pccw|pe|pet|pf|pfizer|pg|ph|pharmacy|phd|philips|phone|photo|photography|photos|physio|piaget|pics|pictet|pictures|pid|pin|ping|pink|pioneer|pizza|pk|pl|place|play|playstation|plumbing|plus|pm|pn|pnc|pohl|poker|politie|porn|post|pr|pramerica|praxi|press|prime|pro|prod|productions|prof|progressive|promo|properties|property|protection|pru|prudential|ps|pt|pub|pw|pwc|py|qa|qpon|quebec|quest|qvc|racing|radio|raid|re|read|realestate|realtor|realty|recipes|red|redstone|redumbrella|rehab|reise|reisen|reit|reliance|ren|rent|rentals|repair|report|republican|rest|restaurant|review|reviews|rexroth|rich|richardli|ricoh|rightathome|ril|rio|rip|rmit|ro|rocher|rocks|rodeo|rogers|room|rs|rsvp|ru|rugby|ruhr|run|rw|rwe|ryukyu|sa|saarland|safe|safety|sakura|sale|salon|samsclub|samsung|sandvik|sandvikcoromant|sanofi|sap|sapo|sarl|sas|save|saxo|sb|sbi|sbs|sc|sca|scb|schaeffler|schmidt|scholarships|school|schule|schwarz|science|scjohnson|scor|scot|sd|se|search|seat|secure|security|seek|select|sener|services|ses|seven|sew|sex|sexy|sfr|sg|sh|shangrila|sharp|shaw|shell|shia|shiksha|shoes|shop|shopping|shouji|show|showtime|shriram|si|silk|sina|singles|site|sj|sk|ski|skin|sky|skype|sl|sling|sm|smart|smile|sn|sncf|so|soccer|social|softbank|softw
are|sohu|solar|solutions|song|sony|soy|space|spiegel|sport|spot|spreadbetting|sr|srl|srt|ss|st|stada|staples|star|starhub|statebank|statefarm|statoil|stc|stcgroup|stockholm|storage|store|stream|studio|study|style|su|sucks|supplies|supply|support|surf|surgery|suzuki|sv|swatch|swiftcover|swiss|sx|sy|sydney|symantec|systems|sz|tab|taipei|talk|taobao|target|tatamotors|tatar|tattoo|tax|taxi|tc|tci|td|tdk|team|tech|technology|tel|telecity|telefonica|temasek|tennis|teva|tf|tg|th|thd|theater|theatre|tiaa|tickets|tienda|tiffany|tips|tires|tirol|tj|tjmaxx|tjx|tk|tkmaxx|tl|tm|tmall|tn|to|today|tokyo|tools|top|toray|toshiba|total|tours|town|toyota|toys|tp|tr|trade|trading|training|travel|travelchannel|travelers|travelersinsurance|trust|trv|tt|tube|tui|tunes|tushu|tv|tvs|tw|tz|ua|ubank|ubs|uconnect|ug|uk|um|unicom|university|uno|uol|ups|us|uy|uz|va|vacations|vana|vanguard|vc|ve|vegas|ventures|verisign|versicherung|vet|vg|vi|viajes|video|vig|viking|villas|vin|vip|virgin|visa|vision|vista|vistaprint|viva|vivo|vlaanderen|vn|vodka|volkswagen|volvo|vote|voting|voto|voyage|vu|vuelos|wales|walmart|walter|wang|wanggou|warman|watch|watches|weather|weatherchannel|webcam|weber|website|wed|wedding|weibo|weir|wf|whoswho|wien|wiki|williamhill|win|windows|wine|winners|wme|wolterskluwer|woodside|work|works|world|wow|ws|wtc|wtf|xbox|xerox|xfinity|xihuan|xin|测试|कॉम|परीक्षा|セール|佛山|ಭಾರತ|慈善|集团|在线|한국|ଭାରତ|大众汽车|点看|คอม|ভাৰত|ভারত|八卦|موقع|বাংলা|公益|公司|香格里拉|网站|移动|我爱你|москва|испытание|қаз|католик|онлайн|сайт|联通|срб|бг|бел|קום|时尚|微博|테스트|淡马锡|ファッション|орг|नेट|ストア|삼성|சிங்கப்பூர்|商标|商店|商城|дети|мкд|טעסט|ею|ポイント|新闻|工行|家電|كوم|中文网|中信|中国|中國|娱乐|谷歌|భారత్|ලංකා|電訊盈科|购物|測試|クラウド|ભારત|通販|भारतम्|भारत|भारोत|آزمایشی|பரிட்சை|网店|संगठन|餐厅|网络|ком|укр|香港|诺基亚|食品|δοκιμή|飞利浦|إختبار|台湾|台灣|手表|手机|мон|الجزائر|عمان|ارامكو|ایران|العليان|اتصالات|امارات|بازار|موريتانيا|پاکستان|الاردن|موبايلي|بارت|بھارت|المغرب|ابوظبي|السعودية|ڀارت|كاثوليك|سودان|همراه|عراق|مليسيا|澳門|닷컴|政府|شبكة|بيتك|عرب|გე|机构|组织机构|健康|ไทย|سورية|招聘|рус|рф|珠宝|تونس|大拿|みんな|グーグル|ελ|世界|書籍|ഭാരതം|ਭਾਰਤ|网址|닷넷|コム|天主教|游戏|vermögensberater|vermögensberatung|企业|信息|嘉里大酒店|嘉里|مصر|قطر|广东|இலங்கை|இந்தியா|հայ|新加坡|فلسطين|テスト|政务|xperia|xxx|xyz|yachts|yahoo|yamaxun|yandex|ye|yodobashi|yoga|yokohama|you|youtube|yt|yun|za|zappos|zara|zero|zip|zippo|zm|zone|zuerich|zw)$"
domain_match = re.match(domain_pattern, possible_domain, re.I)
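
Black leaves domain.py almost untouched: the only change is a second blank line before is_domain, because Black reflows code around string literals but never splits the literals themselves, so the TLD pattern stays a single line far past the 88-column default (experimental string splitting only arrived later, behind a preview flag). Abridged to three TLDs, the shape is:

    # raw-string form, abridged from the pattern above
    domain_pattern = r"^((?=[a-z0-9-]{1,63}\.)(xn--)?[a-z0-9]+(-[a-z0-9]+)*\.)+(com|net|org)$"
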
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -1,5 +1,12 @@
import re
-from urllib.parse import unquote, urljoin, urlsplit, urlunsplit, uses_relative, uses_netloc
+from urllib.parse import (
+ unquote,
+ urljoin,
+ urlsplit,
+ urlunsplit,
+ uses_relative,
+ uses_netloc,
+)
from urllib.robotparser import RobotFileParser
import gusmobile
@@ -12,22 +19,47 @@ from gus.lib.domain import is_domain
uses_relative.append("gemini")
uses_netloc.append("gemini")
-LOG_ROOT_LIKE_PATTERN = re.compile(".*/(gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/?$", flags=re.IGNORECASE)
-LOG_POST_LIKE_PATTERN = re.compile(".*/((gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/.+$|.*/\d{4}[-_]\d{2}[-_]\d{2}.*$|.*/(19|20)\d{6}.*$)", flags=re.IGNORECASE)
-LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(".*/(games|archive|archives|rss|handlers|diagnostics)/.*|.*atom.xml$|.*gemlog.gmi$|.*index.gmi$|.*index.gemini$", flags=re.IGNORECASE)
-LOG_POST_GEMLOGBLUE_LIKE_PATTERN = re.compile("^/users/[a-z][-a-z0-9]*/\d+\.gmi?", flags=re.IGNORECASE)
-LOG_POST_BOSTON_LIKE_PATTERN = re.compile("^/boston/\d{4}/\d{2}/\d+\.\d+", flags=re.IGNORECASE)
-
-ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE)
-ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE)
+LOG_ROOT_LIKE_PATTERN = re.compile(
+ ".*/(gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/?$",
+ flags=re.IGNORECASE,
+)
+LOG_POST_LIKE_PATTERN = re.compile(
+ ".*/((gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/.+$|.*/\d{4}[-_]\d{2}[-_]\d{2}.*$|.*/(19|20)\d{6}.*$)",
+ flags=re.IGNORECASE,
+)
+LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(
+ ".*/(games|archive|archives|rss|handlers|diagnostics)/.*|.*atom.xml$|.*gemlog.gmi$|.*index.gmi$|.*index.gemini$",
+ flags=re.IGNORECASE,
+)
+LOG_POST_GEMLOGBLUE_LIKE_PATTERN = re.compile(
+ "^/users/[a-z][-a-z0-9]*/\d+\.gmi?", flags=re.IGNORECASE
+)
+LOG_POST_BOSTON_LIKE_PATTERN = re.compile(
+ "^/boston/\d{4}/\d{2}/\d+\.\d+", flags=re.IGNORECASE
+)
+
+ROOT_LIKE_ONLY_PATTERN = re.compile(
+ "^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE
+)
+ROOT_LIKE_PATTERN = re.compile(
+ "^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE
+)
PIKKULOG_LIKE_PATTERN = re.compile(".*/pikkulog/.*", flags=re.IGNORECASE)
-AUTHOR_URL_PATTERN = re.compile("^/~([a-z][-a-z0-9]*)/|^/users/~?([a-z][-a-z0-9]*)", flags=re.IGNORECASE)
-AUTHOR_CONTENT_PATTERN = re.compile(".*(by|author): ([\w\s\d]+)", flags=re.IGNORECASE | re.MULTILINE)
+AUTHOR_URL_PATTERN = re.compile(
+ "^/~([a-z][-a-z0-9]*)/|^/users/~?([a-z][-a-z0-9]*)", flags=re.IGNORECASE
+)
+AUTHOR_CONTENT_PATTERN = re.compile(
+ ".*(by|author): ([\w\s\d]+)", flags=re.IGNORECASE | re.MULTILINE
+)
TITLE_CONTENT_PATTERN = re.compile("^#\s(.*)$", flags=re.IGNORECASE | re.MULTILINE)
-TITLE_URL_PATTERN = re.compile(".*/(\d{8}[-_]|\d{4}[-_]\d{2}[-_]\d{2}[-_])?([a-z0-9-_]+)(\.[a-z0-9]+)$", flags=re.IGNORECASE)
+TITLE_URL_PATTERN = re.compile(
+ ".*/(\d{8}[-_]|\d{4}[-_]\d{2}[-_]\d{2}[-_])?([a-z0-9-_]+)(\.[a-z0-9]+)$",
+ flags=re.IGNORECASE,
+)
+
class GeminiRobotFileParser(RobotFileParser):
def set_url(self, url):
@@ -36,7 +68,6 @@ class GeminiRobotFileParser(RobotFileParser):
u, _ = GeminiResource.urlsplit_featureful(url)
self.host, self.path = u[1:3]
-
def read(self):
"""Reads the robots.txt URL and feeds it to the parser."""
gr = GeminiResource(self.url)
@@ -50,7 +81,7 @@ class GeminiRobotFileParser(RobotFileParser):
self.parse(response.content.splitlines())
-class GeminiResource():
+class GeminiResource:
def __init__(self, url, fully_qualified_parent_url=None, parent_hostname=None):
self.raw_url = url
self.urlsplit, self.is_relative = GeminiResource.urlsplit_featureful(
@@ -80,7 +111,7 @@ class GeminiResource():
# things behind the scenes.
is_relative = False
- u = urlsplit(url, 'gemini')
+ u = urlsplit(url, "gemini")
if u.scheme != "gemini":
return None, None
if u.hostname is None:
@@ -89,9 +120,9 @@ class GeminiResource():
if parent_hostname is None:
return None, None
joined = urljoin("gemini://{}".format(parent_hostname), url)
- u = urlsplit(joined, 'gemini')
+ u = urlsplit(joined, "gemini")
is_relative = True
- else: # url does not start with /
+ else: # url does not start with /
# could be: blah.com/test
# could be: test
url_split = url.split("/")
@@ -99,33 +130,36 @@ class GeminiResource():
# prepend with "gemini://" so built-in urlsplit will extract
# the host properly, and continue on
url = "gemini://{}".format(url)
- u = urlsplit(url, 'gemini')
+ u = urlsplit(url, "gemini")
else:
# process relative link
if fully_qualified_parent_url is None:
return None, None
joined = urljoin(fully_qualified_parent_url, url)
- u = urlsplit(joined, 'gemini')
+ u = urlsplit(joined, "gemini")
is_relative = True
return u, is_relative
-
def _get_normalized_url(self):
if not self.is_valid:
return None
if self._normalized_url is None:
- self._normalized_url, self._normalized_host = self._get_normalized_url_and_host()
+ (
+ self._normalized_url,
+ self._normalized_host,
+ ) = self._get_normalized_url_and_host()
return self._normalized_url
-
def _get_normalized_host(self):
if not self.is_valid:
return None
if self._normalized_host is None:
- self._normalized_url, self._normalized_host = self._get_normalized_url_and_host()
+ (
+ self._normalized_url,
+ self._normalized_host,
+ ) = self._get_normalized_url_and_host()
return self._normalized_host
-
def _get_normalized_host_like(self):
if not self.is_valid:
return None
@@ -137,7 +171,6 @@ class GeminiResource():
self._normalized_host_like = normalized_host_like
return self._normalized_host_like
-
def _get_fetchable_url(self):
if not self.is_valid:
return None
@@ -162,27 +195,32 @@ class GeminiResource():
self._fetchable_url = url
return self._fetchable_url
-
def _get_indexable_url(self):
if not self.is_valid:
return None
if self._indexable_url is None:
indexable_url = unquote(self.fetchable_url)
if self.urlsplit.port == 1965:
- indexable_url = self.normalized_url.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1)
+ indexable_url = self.normalized_url.replace(
+ self.urlsplit.hostname.lower() + ":1965",
+ self.urlsplit.hostname.lower(),
+ 1,
+ )
self._indexable_url = indexable_url
return self._indexable_url
-
def _get_is_root_like(self):
if self._is_root_like is None:
is_root_like = False
- if self.urlsplit.path == "" or self.urlsplit.path == "/" or ROOT_LIKE_ONLY_PATTERN.match(self.urlsplit.path):
+ if (
+ self.urlsplit.path == ""
+ or self.urlsplit.path == "/"
+ or ROOT_LIKE_ONLY_PATTERN.match(self.urlsplit.path)
+ ):
is_root_like = True
self._is_root_like = is_root_like
return self._is_root_like
-
def _get_is_pikkulog_like(self):
if self._is_pikkulog_like is None:
is_pikkulog_like = False
@@ -192,30 +230,39 @@ class GeminiResource():
self._is_pikkulog_like = is_pikkulog_like
return self._is_pikkulog_like
-
def _get_is_log_root_like(self):
if self._is_log_root_like is None:
is_log_root_like = False
- if self.urlsplit.path == "" or self.urlsplit.path == "/" or LOG_ROOT_LIKE_PATTERN.match(self.urlsplit.path):
+ if (
+ self.urlsplit.path == ""
+ or self.urlsplit.path == "/"
+ or LOG_ROOT_LIKE_PATTERN.match(self.urlsplit.path)
+ ):
is_log_root_like = True
self._is_log_root_like = is_log_root_like
return self._is_log_root_like
-
def _get_is_log_post_like(self):
if self._is_log_post_like is None:
is_log_post_like = False
post_like_match = LOG_POST_LIKE_PATTERN.match(self.urlsplit.path)
- post_like_exclusion_match = LOG_POST_LIKE_EXCLUSION_PATTERN.match(self.urlsplit.path)
- post_gemlogblue_match = LOG_POST_GEMLOGBLUE_LIKE_PATTERN.match(self.urlsplit.path)
+ post_like_exclusion_match = LOG_POST_LIKE_EXCLUSION_PATTERN.match(
+ self.urlsplit.path
+ )
+ post_gemlogblue_match = LOG_POST_GEMLOGBLUE_LIKE_PATTERN.match(
+ self.urlsplit.path
+ )
post_boston_match = LOG_POST_BOSTON_LIKE_PATTERN.match(self.urlsplit.path)
- if (post_like_match and not post_like_exclusion_match) or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match) or (self.normalized_host == "gemini.conman.org" and post_boston_match):
+ if (
+ (post_like_match and not post_like_exclusion_match)
+ or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match)
+ or (self.normalized_host == "gemini.conman.org" and post_boston_match)
+ ):
is_log_post_like = True
self._is_log_post_like = is_log_post_like
return self._is_log_post_like
-
def get_friendly_author(self, content):
if not self.is_valid:
return None
@@ -238,7 +285,6 @@ class GeminiResource():
friendly_author = self.normalized_host
return friendly_author
-
def get_friendly_title(self, content):
if not self.is_valid:
return None
@@ -253,13 +299,18 @@ class GeminiResource():
# if no content match, try looking in URL
title_url_match = TITLE_URL_PATTERN.match(self.urlsplit.path)
if title_url_match:
- friendly_title = title_url_match[2].replace("-", " ").replace("_", " ").strip().title()
+ friendly_title = (
+ title_url_match[2]
+ .replace("-", " ")
+ .replace("_", " ")
+ .strip()
+ .title()
+ )
if friendly_title is None:
# if still no match, use URL path
friendly_title = self.urlsplit.path.lstrip("/")
return friendly_title
-
def get_default_change_frequency(self, category):
if not self.is_valid:
return None
@@ -287,7 +338,6 @@ class GeminiResource():
self._default_change_frequency = change_frequency
return self._default_change_frequency
-
def increment_change_frequency(self, existing_change_frequency, category):
if category == "content":
if self.is_root_like or self.is_log_root_like:
@@ -309,7 +359,6 @@ class GeminiResource():
else:
raise Exception.NameError("Unrecognized resource category")
-
# constructed from fetchable_url
# does not matter if quoted or unquoted so I choose arbitrarily to
# standardize on unquoting it.
@@ -333,15 +382,17 @@ class GeminiResource():
# and a server redirecting to the same URL _with_ a trailing slash.
return gusmobile.fetch(self.fetchable_url)
-
def _get_normalized_url_and_host(self):
url_normalized = unquote(self.fetchable_url.lower().rstrip("/"))
if self.urlsplit.port == 1965:
- url_normalized = url_normalized.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1)
+ url_normalized = url_normalized.replace(
+ self.urlsplit.hostname.lower() + ":1965",
+ self.urlsplit.hostname.lower(),
+ 1,
+ )
host_normalized = self.urlsplit.hostname.lower()
return url_normalized, host_normalized
-
def extract_contained_resources(self, content):
# this finds all gemini URLs within the content of a given GeminiResource and
# returns them as a list of new GeminiResources
@@ -349,9 +400,13 @@ class GeminiResource():
return self.contained_resources
link_pattern = "^=>\s*(\S+)"
- preformat_pattern = r'^```.*?^```'
- content_without_preformat = re.sub(preformat_pattern, '', content, flags=re.DOTALL | re.MULTILINE)
- probable_urls = re.findall(link_pattern, content_without_preformat, re.MULTILINE)
+ preformat_pattern = r"^```.*?^```"
+ content_without_preformat = re.sub(
+ preformat_pattern, "", content, flags=re.DOTALL | re.MULTILINE
+ )
+ probable_urls = re.findall(
+ link_pattern, content_without_preformat, re.MULTILINE
+ )
resources = []
for url in probable_urls:
resource = GeminiResource(
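
Two more mechanical Black behaviors appear in gemini.py. A class statement with no bases loses its empty parentheses, so class GeminiResource(): becomes class GeminiResource:. And when a tuple-unpacking assignment no longer fits on one line, Black parenthesizes the target list so it can wrap, which is what happened inside _get_normalized_url and _get_normalized_host. A sketch with placeholder values:

    class Resource:  # bare name; Black removed the empty parentheses
        pass

    (
        normalized_url,
        normalized_host,
    ) = ("gemini://example.org/", "example.org")  # placeholder values
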
diff --git a/gus/lib/logging.py b/gus/lib/logging.py
@@ -7,11 +7,11 @@ def add_arguments(parser):
"""Add arguments to the given argument argparse parser."""
parser.add_argument(
- '--logging-config',
- '-c',
- dest='logging_ini_fname',
+ "--logging-config",
+ "-c",
+ dest="logging_ini_fname",
default=False,
- help='Location of logging configuration file'
+ help="Location of logging configuration file",
)
@@ -22,11 +22,10 @@ def handle_arguments(args):
if os.path.isfile(args.logging_ini_fname):
logging.config.fileConfig(args.logging_ini_fname)
else:
- sys.exit('Can not find logging ini file: %s' %
- args.logging_ini_fname)
+ sys.exit("Can not find logging ini file: %s" % args.logging_ini_fname)
- elif os.path.isfile('logging.ini'):
- logging.config.fileConfig('logging.ini')
+ elif os.path.isfile("logging.ini"):
+ logging.config.fileConfig("logging.ini")
def strip_control_chars(s):
diff --git a/gus/lib/misc.py b/gus/lib/misc.py
@@ -8,15 +8,24 @@ License: MIT
"""
SYMBOLS = {
- 'customary' : ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'),
- 'customary_ext' : ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
- 'zetta', 'iotta'),
- 'iec' : ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
- 'iec_ext' : ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
- 'zebi', 'yobi'),
+ "customary": ("B", "K", "M", "G", "T", "P", "E", "Z", "Y"),
+ "customary_ext": (
+ "byte",
+ "kilo",
+ "mega",
+ "giga",
+ "tera",
+ "peta",
+ "exa",
+ "zetta",
+ "iotta",
+ ),
+ "iec": ("Bi", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi"),
+ "iec_ext": ("byte", "kibi", "mebi", "gibi", "tebi", "pebi", "exbi", "zebi", "yobi"),
}
-def bytes2human(n, format='%(value).1f %(symbol)s', symbols='customary'):
+
+def bytes2human(n, format="%(value).1f %(symbol)s", symbols="customary"):
"""
Convert n bytes into a human readable string based on format.
symbols can be either "customary", "customary_ext", "iec" or "iec_ext",
@@ -59,7 +68,7 @@ def bytes2human(n, format='%(value).1f %(symbol)s', symbols='customary'):
symbols = SYMBOLS[symbols]
prefix = {}
for i, s in enumerate(symbols[1:]):
- prefix[s] = 1 << (i+1)*10
+ prefix[s] = 1 << (i + 1) * 10
for symbol in reversed(symbols[1:]):
if n >= prefix[symbol]:
value = float(n) / prefix[symbol]
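
The one substantive line Black touched in misc.py is the prefix computation, now spaced as 1 << (i + 1) * 10. Since * binds tighter than <<, this shifts by (i + 1) * 10 bits: 2**10 for the first symbol after "B", 2**20 for the next, and so on. A quick check of the table bytes2human builds from SYMBOLS["customary"]:

    symbols = ("B", "K", "M", "G", "T", "P", "E", "Z", "Y")
    prefix = {}
    for i, s in enumerate(symbols[1:]):
        prefix[s] = 1 << (i + 1) * 10  # * binds tighter than <<
    assert prefix["K"] == 1024
    assert prefix["M"] == 1024 ** 2
    assert prefix["Y"] == 1 << 80
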
diff --git a/gus/lib/whoosh_extensions.py b/gus/lib/whoosh_extensions.py
@@ -16,7 +16,12 @@ def UrlAnalyzer():
"""
- return RegexTokenizer(expression=":1965|^gemini://|[/\.\?]", gaps=True) | IntraWordFilter() | LowercaseFilter() | StemFilter()
+ return (
+ RegexTokenizer(expression=":1965|^gemini://|[/\.\?]", gaps=True)
+ | IntraWordFilter()
+ | LowercaseFilter()
+ | StemFilter()
+ )
class GeminiFormatter(highlight.Formatter):
@@ -35,7 +40,6 @@ class GeminiFormatter(highlight.Formatter):
# string
return "%s" % tokentext
-
def format_fragment(self, fragment, replace=False):
"""Returns a formatted version of the given text, using the "token"
objects in the given :class:`Fragment`.
@@ -57,21 +61,22 @@ class GeminiFormatter(highlight.Formatter):
if t.startchar < index:
continue
if t.startchar > index:
- output.append(self._text(text[index:t.startchar]))
+ output.append(self._text(text[index : t.startchar]))
output.append(self.format_token(text, t, replace))
index = t.endchar
- output.append(self._text(text[index:fragment.endchar]))
+ output.append(self._text(text[index : fragment.endchar]))
output.append("...")
out_string = "".join(output)
- out_string = out_string.replace("\n", " ").replace('\r', ' ')
- out_string = ' '.join(out_string.split())
+ out_string = out_string.replace("\n", " ").replace("\r", " ")
+ out_string = " ".join(out_string.split())
return out_string
special_char_pattern = re.compile("[^\w\s,\.;-\?\!']")
link_pattern = re.compile("://|=>")
+
class GeminiScorer(highlight.FragmentScorer):
def __call__(self, f):
# Add up the boosts for the matched terms in this passage
@@ -87,10 +92,12 @@ class GeminiScorer(highlight.FragmentScorer):
# ascii art, as well as source code (which, I suppose will make snippets
# lower quality for actual searches for source code, but that is a very
# small minority of searches in the current state of things).
- num_special_chars = len(special_char_pattern.findall(f.text[f.startchar:f.endchar]))
+ num_special_chars = len(
+ special_char_pattern.findall(f.text[f.startchar : f.endchar])
+ )
score -= 4 * num_special_chars + math.pow(num_special_chars, 1.5)
- num_links = len(link_pattern.findall(f.text[f.startchar:f.endchar]))
+ num_links = len(link_pattern.findall(f.text[f.startchar : f.endchar]))
score -= 30 * num_links
return max(0, score)
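
The slice rewrites in this file follow PEP 8's rule for slices, which Black enforces: when either operand of a slice colon is more than a plain name or literal, the colon is treated like a binary operator and spaced equally on both sides, while simple slices stay tight. Hence text[index:t.startchar] gains spaces, but an all-simple slice would not:

    text = "an example fragment to slice"
    index, endchar = 3, 10
    a = text[index:endchar]        # simple names: colon stays tight
    b = text[index : endchar - 1]  # complex operand: both sides spaced
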
diff --git a/serve/constants.py b/serve/constants.py
@@ -44,9 +44,5 @@ QUOTE_BANK = [
"quote": "The truth will set you free. But not until it is finished with you.",
"author": "David Foster Wallace",
},
- {
- "quote": "Jazz isn't dead. It just smells funny.",
- "author": "Frank Zappa",
- },
-
+ {"quote": "Jazz isn't dead. It just smells funny.", "author": "Frank Zappa",},
]
diff --git a/serve/main.py b/serve/main.py
@@ -4,25 +4,13 @@ import jetforce
from . import app, gus
+
def parse_args():
parser = argparse.ArgumentParser()
- parser.add_argument(
- "--host",
- help="Server address to bind to",
- default="127.0.0.1"
- )
- parser.add_argument(
- "--port",
- help="Server port to bind to",
- type=int,
- default=1965
- )
- parser.add_argument(
- "--hostname",
- help="Server hostname",
- default="localhost"
- )
+ parser.add_argument("--host", help="Server address to bind to", default="127.0.0.1")
+ parser.add_argument("--port", help="Server port to bind to", type=int, default=1965)
+ parser.add_argument("--hostname", help="Server hostname", default="localhost")
parser.add_argument(
"--tls-certfile",
dest="certfile",
diff --git a/serve/models.py b/serve/models.py
@@ -9,11 +9,15 @@ from whoosh.index import open_dir
from . import constants
from gus.lib.db_model import init_db, Crawl, Link, Page, Search, Thread
from gus.lib.gemini import GeminiResource
-from gus.lib.index_statistics import compute_index_statistics, load_all_statistics_from_file
+from gus.lib.index_statistics import (
+ compute_index_statistics,
+ load_all_statistics_from_file,
+)
from gus.lib.misc import bytes2human
from gus.lib.whoosh_extensions import GeminiFormatter, GeminiScorer
-class GUS():
+
+class GUS:
def __init__(self):
self.ix = open_dir(constants.INDEX_DIR)
self.searcher = self.ix.searcher()
@@ -27,12 +31,15 @@ class GUS():
self.db = init_db(f"{constants.INDEX_DIR}/{constants.DB_FILENAME}")
self.statistics = compute_index_statistics(self.db)
- self.statistics_historical_overall = load_all_statistics_from_file(constants.STATISTICS_FILE)
-
+ self.statistics_historical_overall = load_all_statistics_from_file(
+ constants.STATISTICS_FILE
+ )
def init_query_parser(ix):
or_group = qparser.OrGroup.factory(0.99)
- query_parser = qparser.MultifieldParser(["content", "url", "prompt"], ix.schema, group=or_group)
+ query_parser = qparser.MultifieldParser(
+ ["content", "url", "prompt"], ix.schema, group=or_group
+ )
query_parser.add_plugin(qparser.RegexPlugin())
query_parser.add_plugin(qparser.GtLtPlugin())
query_parser.remove_plugin_class(qparser.WildcardPlugin)
@@ -40,36 +47,44 @@ class GUS():
query_parser.remove_plugin_class(qparser.RangePlugin)
return query_parser
-
def search_index(self, query, requested_page):
Search.create(query=query, timestamp=datetime.utcnow())
query = self.query_parser.parse(query)
results = self.searcher.search_page(query, requested_page, pagelen=10)
return (
len(results),
- [{
- "score" : result.score,
- "indexed_at" : result["indexed_at"],
- "url" : result["url"],
- "fetchable_url" : result["fetchable_url"],
- "content_type" : result["content_type"],
- "charset" : result["charset"] if "charset" in result else "none",
- "size" : result["size"] if "size" in result else 0,
- "prompt" : result["prompt"] if "prompt" in result else "",
- "highlights" : self.gemini_highlighter.highlight_hit(result, "content", top=1) if "content" in result and result["content_type"] in ["text/plain", "text/gemini", "text/markdown"] else "",
- "link_text" : GUS._get_link_text(result),
- "backlink_count": result["backlink_count"],
- } for result in results],
+ [
+ {
+ "score": result.score,
+ "indexed_at": result["indexed_at"],
+ "url": result["url"],
+ "fetchable_url": result["fetchable_url"],
+ "content_type": result["content_type"],
+ "charset": result["charset"] if "charset" in result else "none",
+ "size": result["size"] if "size" in result else 0,
+ "prompt": result["prompt"] if "prompt" in result else "",
+ "highlights": self.gemini_highlighter.highlight_hit(
+ result, "content", top=1
+ )
+ if "content" in result
+ and result["content_type"]
+ in ["text/plain", "text/gemini", "text/markdown"]
+ else "",
+ "link_text": GUS._get_link_text(result),
+ "backlink_count": result["backlink_count"],
+ }
+ for result in results
+ ],
)
-
def get_backlinks(self, url):
resource = GeminiResource(url)
if not resource.is_valid:
return [], []
u = resource.indexable_url.rstrip("/")
- backlinks_query = Page.raw("""SELECT p_from.url, l.is_cross_host_like
+ backlinks_query = Page.raw(
+ """SELECT p_from.url, l.is_cross_host_like
FROM page AS p_from
JOIN indexable_crawl AS ic
ON ic.page_id == p_from.id
@@ -80,18 +95,22 @@ ON p_to.id == l.to_page_id
WHERE p_to.url IN (?, ?)
AND p_from.normalized_url != ?
GROUP BY p_from.normalized_url
-ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalized_url)
+ORDER BY l.is_cross_host_like, p_from.url ASC""",
+ u,
+ f"{u}/",
+ resource.normalized_url,
+ )
backlinks = backlinks_query.execute()
internal_backlink_urls = [b.url for b in backlinks if not b.is_cross_host_like]
external_backlink_urls = [b.url for b in backlinks if b.is_cross_host_like]
return internal_backlink_urls, external_backlink_urls
-
def get_threads(self, sort="recency"):
sort = sort.lower()
if sort == "recency":
- threads_query = Thread.raw("""SELECT t.*
+ threads_query = Thread.raw(
+ """SELECT t.*
, tp.address
, tp.friendly_author
, tp.friendly_title
@@ -111,9 +130,11 @@ JOIN crawl AS c
ON c.page_id == p.id
WHERE c.status == 20
GROUP BY tp.id
-ORDER BY t.updated_at DESC, t.id ASC, tp.address ASC""")
+ORDER BY t.updated_at DESC, t.id ASC, tp.address ASC"""
+ )
elif sort == "length":
- threads_query = Thread.raw("""SELECT t.*
+ threads_query = Thread.raw(
+ """SELECT t.*
, tp.address
, tp.friendly_author
, tp.friendly_title
@@ -136,7 +157,8 @@ JOIN crawl AS c
ON c.page_id == p.id
WHERE c.status == 20
GROUP BY tp.id
-ORDER BY t.thread_length DESC, t.updated_at DESC, t.id ASC, tp.address ASC""")
+ORDER BY t.thread_length DESC, t.updated_at DESC, t.id ASC, tp.address ASC"""
+ )
else:
    return []
threads = []
@@ -144,44 +166,52 @@ ORDER BY t.thread_length DESC, t.updated_at DESC, t.id ASC, tp.address ASC""")
last_id = None
for thread_member in threads_query.iterator():
if thread_member.updated_at.date() != last_date:
- threads.append({
- "threads": [],
- "date": thread_member.updated_at,
- })
+ threads.append(
+ {"threads": [], "date": thread_member.updated_at,}
+ )
last_date = thread_member.updated_at.date()
if thread_member.id != last_id:
- threads[-1]["threads"].append({
- "members": [],
- "updated_at": thread_member.updated_at,
- })
+ threads[-1]["threads"].append(
+ {"members": [], "updated_at": thread_member.updated_at,}
+ )
last_id = thread_member.id
- threads[-1]["threads"][-1]["members"].append({
- "url": thread_member.url,
- "fetchable_url": thread_member.fetchable_url,
- "address": thread_member.address,
- "friendly_author": thread_member.friendly_author,
- "friendly_title": thread_member.friendly_title,
- "first_seen": datetime.strptime(thread_member.first_seen, "%Y-%m-%d %H:%M:%S.%f"),
- })
+ threads[-1]["threads"][-1]["members"].append(
+ {
+ "url": thread_member.url,
+ "fetchable_url": thread_member.fetchable_url,
+ "address": thread_member.address,
+ "friendly_author": thread_member.friendly_author,
+ "friendly_title": thread_member.friendly_title,
+ "first_seen": datetime.strptime(
+ thread_member.first_seen, "%Y-%m-%d %H:%M:%S.%f"
+ ),
+ }
+ )
# return sorted(threads, key=lambda x: (x["updated_at"], ), reverse=True)
return threads
-
def _get_link_text(result):
if result["content_type"] == "input":
prompt_suffix = ": {}".format(result["prompt"])
- link_text = "{} ({}{})".format(result["url"][9:], result["content_type"], prompt_suffix)
+ link_text = "{} ({}{})".format(
+ result["url"][9:], result["content_type"], prompt_suffix
+ )
else:
- lang_str = ", {}".format(result["lang"]) if "lang" in result and result["lang"] != "none" else ""
+ lang_str = (
+ ", {}".format(result["lang"])
+ if "lang" in result and result["lang"] != "none"
+ else ""
+ )
link_text = "{} ({}, {})".format(
- result["url"][9:], result["content_type"],
- bytes2human(result["size"], format="%(value).0f%(symbol)s")
+ result["url"][9:],
+ result["content_type"],
+ bytes2human(result["size"], format="%(value).0f%(symbol)s"),
)
return link_text
-
def get_feeds(self):
- feeds_query = Page.raw("""SELECT DISTINCT p.*
+ feeds_query = Page.raw(
+ """SELECT DISTINCT p.*
FROM page AS p
JOIN indexable_crawl AS c
ON c.page_id == p.id
@@ -190,12 +220,13 @@ OR p.url LIKE '%feed.xml'
OR p.url LIKE '%.rss'
OR p.url LIKE '%.atom'
OR p.content_type IN ('application/atom+xml', 'application/rss+xml')
-""")
+"""
+ )
return feeds_query.execute()
-
def get_newest_hosts(self):
- newest_hosts_query = Page.raw("""SELECT p.domain, MIN(c.timestamp) AS first_seen
+ newest_hosts_query = Page.raw(
+ """SELECT p.domain, MIN(c.timestamp) AS first_seen
FROM page as p
JOIN indexable_crawl AS ic
ON ic.page_id == p.id
@@ -204,12 +235,13 @@ ON c.page_id == p.id
GROUP BY p.domain
ORDER BY first_seen DESC
LIMIT 10
-""")
+"""
+ )
return newest_hosts_query.execute()
-
def get_newest_pages(self):
- newest_pages_query = Page.raw("""SELECT p.url, p.fetchable_url, MIN(c.timestamp) AS first_seen
+ newest_pages_query = Page.raw(
+ """SELECT p.url, p.fetchable_url, MIN(c.timestamp) AS first_seen
FROM page as p
JOIN indexable_crawl AS ic
ON ic.page_id == p.id
@@ -218,19 +250,19 @@ ON c.page_id == p.id
GROUP BY p.url
ORDER BY first_seen DESC
LIMIT 50
-""")
+"""
+ )
return newest_pages_query.execute()
-
def get_search_suggestions(self, query):
suggestions = []
corrector = self.searcher.corrector("content")
for query_part in query.split(" "):
query_part_suggestions = corrector.suggest(query_part, limit=3)
- suggestions.extend({
- "raw": suggestion,
- "quoted": quote(suggestion)
- } for suggestion in query_part_suggestions)
+ suggestions.extend(
+ {"raw": suggestion, "quoted": quote(suggestion)}
+ for suggestion in query_part_suggestions
+ )
return suggestions
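
Taken together, the model methods above lean on a small set of Whoosh
calls; here is a self-contained sketch of the same search-and-suggest
flow (the index path and the query strings are illustrative, not values
from GUS configuration):

    # Sketch only: parse a query with an OrGroup parser, page the results,
    # and fall back to spelling suggestions when nothing matches.
    from urllib.parse import quote

    from whoosh import qparser
    from whoosh.index import open_dir

    ix = open_dir("index")  # assumed index directory
    searcher = ix.searcher()

    # factory(0.99) scales scores down for documents matching fewer query
    # terms, so OR semantics still rank fuller matches first.
    parser = qparser.MultifieldParser(
        ["content", "url", "prompt"], ix.schema, group=qparser.OrGroup.factory(0.99)
    )

    results = searcher.search_page(parser.parse("gemini protocol"), 1, pagelen=10)
    if len(results) == 0:
        corrector = searcher.corrector("content")
        suggestions = [quote(s) for s in corrector.suggest("protocl", limit=3)]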
diff --git a/serve/views.py b/serve/views.py
@@ -8,7 +8,12 @@ import jinja2
from jetforce import Request, Response, Status, JetforceApplication
from . import constants
-from .models import compute_verbose, compute_requested_results_page, GUS, process_seed_request
+from .models import (
+ compute_verbose,
+ compute_requested_results_page,
+ GUS,
+ process_seed_request,
+)
TEMPLATE_DIR = os.path.join(os.path.dirname(__file__), "templates")
@@ -19,6 +24,7 @@ template_env = jinja2.Environment(
lstrip_blocks=True,
)
+
def datetimeformat(value, format="%Y-%m-%d"):
return value.strftime(format)
@@ -29,8 +35,10 @@ def threadaddressformat(value):
return " " * (depth - 1) + "↳"
return ""
-template_env.filters['datetimeformat'] = datetimeformat
-template_env.filters['threadaddressformat'] = threadaddressformat
+
+template_env.filters["datetimeformat"] = datetimeformat
+template_env.filters["threadaddressformat"] = threadaddressformat
+
def render_template(name: str, *args, **kwargs) -> str:
"""
@@ -38,9 +46,11 @@ def render_template(name: str, *args, **kwargs) -> str:
"""
return template_env.get_template(name).render(*args, **kwargs)
+
app = JetforceApplication()
gus = GUS()
+
@app.route("/favicon.txt", strict_trailing_slash=False)
def favicon(request):
return Response(Status.SUCCESS, "text/plain", "🔭")
@@ -58,104 +68,128 @@ def add_seed(request):
@app.route("/statistics", strict_trailing_slash=False)
def statistics(request):
- body = render_template("statistics.gmi",
- statistics=gus.statistics,
- index_modification_time=gus.statistics["index_modification_time"],
- quote=random.choice(constants.QUOTE_BANK))
+ body = render_template(
+ "statistics.gmi",
+ statistics=gus.statistics,
+ index_modification_time=gus.statistics["index_modification_time"],
+ quote=random.choice(constants.QUOTE_BANK),
+ )
return Response(Status.SUCCESS, "text/gemini", body)
@app.route("/statistics/historical/overall", strict_trailing_slash=False)
def statistics_historical_overall(request):
- body = render_template("statistics_historical_overall.gmi",
- statistics_historical_overall=gus.statistics_historical_overall,
- index_modification_time=gus.statistics["index_modification_time"],
- quote=random.choice(constants.QUOTE_BANK))
+ body = render_template(
+ "statistics_historical_overall.gmi",
+ statistics_historical_overall=gus.statistics_historical_overall,
+ index_modification_time=gus.statistics["index_modification_time"],
+ quote=random.choice(constants.QUOTE_BANK),
+ )
return Response(Status.SUCCESS, "text/gemini", body)
@app.route("/known-hosts", strict_trailing_slash=False)
def known_hosts(request):
- body = render_template("known_hosts.gmi",
- # TODO: remove this `sorted` after the next index generation
- known_hosts=sorted(gus.statistics["domains"]),
- index_modification_time=gus.statistics["index_modification_time"],
- quote=random.choice(constants.QUOTE_BANK))
+ body = render_template(
+ "known_hosts.gmi",
+ # TODO: remove this `sorted` after the next index generation
+ known_hosts=sorted(gus.statistics["domains"]),
+ index_modification_time=gus.statistics["index_modification_time"],
+ quote=random.choice(constants.QUOTE_BANK),
+ )
return Response(Status.SUCCESS, "text/gemini", body)
@app.route("/newest-hosts", strict_trailing_slash=False)
def newest_hosts(request):
- body = render_template("newest_hosts.gmi",
- newest_hosts=gus.get_newest_hosts(),
- index_modification_time=gus.statistics["index_modification_time"],
- quote=random.choice(constants.QUOTE_BANK))
+ body = render_template(
+ "newest_hosts.gmi",
+ newest_hosts=gus.get_newest_hosts(),
+ index_modification_time=gus.statistics["index_modification_time"],
+ quote=random.choice(constants.QUOTE_BANK),
+ )
return Response(Status.SUCCESS, "text/gemini", body)
@app.route("/newest-pages", strict_trailing_slash=False)
def newest_pages(request):
- body = render_template("newest_pages.gmi",
- newest_pages=gus.get_newest_pages(),
- index_modification_time=gus.statistics["index_modification_time"],
- quote=random.choice(constants.QUOTE_BANK))
+ body = render_template(
+ "newest_pages.gmi",
+ newest_pages=gus.get_newest_pages(),
+ index_modification_time=gus.statistics["index_modification_time"],
+ quote=random.choice(constants.QUOTE_BANK),
+ )
return Response(Status.SUCCESS, "text/gemini", body)
@app.route("/known-feeds", strict_trailing_slash=False)
def known_feeds(request):
- body = render_template("known_feeds.gmi",
- known_feeds=gus.get_feeds(),
- index_modification_time=gus.statistics["index_modification_time"],
- quote=random.choice(constants.QUOTE_BANK))
+ body = render_template(
+ "known_feeds.gmi",
+ known_feeds=gus.get_feeds(),
+ index_modification_time=gus.statistics["index_modification_time"],
+ quote=random.choice(constants.QUOTE_BANK),
+ )
return Response(Status.SUCCESS, "text/gemini", body)
@app.route("", strict_trailing_slash=False)
def index(request):
- body = render_template("index.gmi",
- index_modification_time=gus.statistics["index_modification_time"],
- quote=random.choice(constants.QUOTE_BANK))
+ body = render_template(
+ "index.gmi",
+ index_modification_time=gus.statistics["index_modification_time"],
+ quote=random.choice(constants.QUOTE_BANK),
+ )
return Response(Status.SUCCESS, "text/gemini", body)
@app.route("/about", strict_trailing_slash=False)
def about(request):
- body = render_template("about.gmi",
- index_modification_time=gus.statistics["index_modification_time"],
- quote=random.choice(constants.QUOTE_BANK))
+ body = render_template(
+ "about.gmi",
+ index_modification_time=gus.statistics["index_modification_time"],
+ quote=random.choice(constants.QUOTE_BANK),
+ )
return Response(Status.SUCCESS, "text/gemini", body)
@app.route("/documentation/searching", strict_trailing_slash=False)
def documentation_searching(request):
- body = render_template("documentation/searching.gmi",
- index_modification_time=gus.statistics["index_modification_time"],
- quote=random.choice(constants.QUOTE_BANK))
+ body = render_template(
+ "documentation/searching.gmi",
+ index_modification_time=gus.statistics["index_modification_time"],
+ quote=random.choice(constants.QUOTE_BANK),
+ )
return Response(Status.SUCCESS, "text/gemini", body)
@app.route("/documentation/indexing", strict_trailing_slash=False)
def documentation_indexing(request):
- body = render_template("documentation/indexing.gmi",
- index_modification_time=gus.statistics["index_modification_time"],
- quote=random.choice(constants.QUOTE_BANK))
+ body = render_template(
+ "documentation/indexing.gmi",
+ index_modification_time=gus.statistics["index_modification_time"],
+ quote=random.choice(constants.QUOTE_BANK),
+ )
return Response(Status.SUCCESS, "text/gemini", body)
@app.route("/documentation/backlinks", strict_trailing_slash=False)
def documentation_backlinks(request):
- body = render_template("documentation/backlinks.gmi",
- index_modification_time=gus.statistics["index_modification_time"],
- quote=random.choice(constants.QUOTE_BANK))
+ body = render_template(
+ "documentation/backlinks.gmi",
+ index_modification_time=gus.statistics["index_modification_time"],
+ quote=random.choice(constants.QUOTE_BANK),
+ )
return Response(Status.SUCCESS, "text/gemini", body)
@app.route("/news", strict_trailing_slash=False)
def news(request):
- body = render_template("news.gmi",
- index_modification_time=gus.statistics["index_modification_time"],
- quote=random.choice(constants.QUOTE_BANK))
+ body = render_template(
+ "news.gmi",
+ index_modification_time=gus.statistics["index_modification_time"],
+ quote=random.choice(constants.QUOTE_BANK),
+ )
return Response(Status.SUCCESS, "text/gemini", body)
@@ -171,23 +205,27 @@ def search(request):
current_page = min(requested_page, num_pages)
if num_results == 0:
current_page = 0
- body = render_template("search.gmi",
- query=request.query,
- quoted_query=quote(request.query),
- verbose=verbose,
- num_results=num_results,
- results=results,
- current_page=current_page,
- num_pages=num_pages,
- index_modification_time=gus.statistics["index_modification_time"],
- quote=random.choice(constants.QUOTE_BANK))
+ body = render_template(
+ "search.gmi",
+ query=request.query,
+ quoted_query=quote(request.query),
+ verbose=verbose,
+ num_results=num_results,
+ results=results,
+ current_page=current_page,
+ num_pages=num_pages,
+ index_modification_time=gus.statistics["index_modification_time"],
+ quote=random.choice(constants.QUOTE_BANK),
+ )
else:
search_suggestions = gus.get_search_suggestions(request.query)
- body = render_template("search_suggestions.gmi",
- query=request.query,
- search_suggestions=search_suggestions,
- index_modification_time=gus.statistics["index_modification_time"],
- quote=random.choice(constants.QUOTE_BANK))
+ body = render_template(
+ "search_suggestions.gmi",
+ query=request.query,
+ search_suggestions=search_suggestions,
+ index_modification_time=gus.statistics["index_modification_time"],
+ quote=random.choice(constants.QUOTE_BANK),
+ )
return Response(Status.SUCCESS, "text/gemini", body)
else:
return Response(Status.INPUT, "Search query")
@@ -213,12 +251,14 @@ def backlinks(request):
if request.query:
url = unquote(request.query)
internal_backlinks, external_backlinks = gus.get_backlinks(url)
- body = render_template("backlinks.gmi",
- url=url,
- internal_backlinks=internal_backlinks,
- external_backlinks=external_backlinks,
- index_modification_time=gus.statistics["index_modification_time"],
- quote=random.choice(constants.QUOTE_BANK))
+ body = render_template(
+ "backlinks.gmi",
+ url=url,
+ internal_backlinks=internal_backlinks,
+ external_backlinks=external_backlinks,
+ index_modification_time=gus.statistics["index_modification_time"],
+ quote=random.choice(constants.QUOTE_BANK),
+ )
return Response(Status.SUCCESS, "text/gemini", body)
else:
return Response(Status.INPUT, "Gemini URL")
@@ -228,9 +268,11 @@ def backlinks(request):
def threads(request):
sort = request.query or "recency"
threads = gus.get_threads(sort)
- body = render_template("threads.gmi",
- threads=threads,
- sort=sort,
- index_modification_time=gus.statistics["index_modification_time"],
- quote=random.choice(constants.QUOTE_BANK))
+ body = render_template(
+ "threads.gmi",
+ threads=threads,
+ sort=sort,
+ index_modification_time=gus.statistics["index_modification_time"],
+ quote=random.choice(constants.QUOTE_BANK),
+ )
return Response(Status.SUCCESS, "text/gemini", body)
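
The datetimeformat and threadaddressformat filters registered above are
what the .gmi templates call; a quick illustrative check (the template
string is invented for the example, not taken from the repository):

    # Sketch only: exercise a registered filter through the environment.
    from datetime import datetime

    rendered = template_env.from_string(
        "{{ ts | datetimeformat('%Y-%m-%d') }}"
    ).render(ts=datetime(2020, 11, 6))
    assert rendered == "2020-11-06"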
diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py
@@ -3,27 +3,31 @@ from gus.lib.gemini import GeminiResource
class TestGeminiResource(unittest.TestCase):
-
def test_extract_contained_resources(self):
- url = 'gemini://host'
+ url = "gemini://host"
# no content
- resources = GeminiResource(url).extract_contained_resources('')
+ resources = GeminiResource(url).extract_contained_resources("")
self.assertEqual(resources, [])
# not a link
- resources = GeminiResource(url).extract_contained_resources(' => link')
+ resources = GeminiResource(url).extract_contained_resources(" => link")
self.assertEqual(resources, [])
- resources = GeminiResource(url).extract_contained_resources('```\n=> preformatted\n```')
+ resources = GeminiResource(url).extract_contained_resources(
+ "```\n=> preformatted\n```"
+ )
self.assertEqual(resources, [])
# some links
- resources = GeminiResource(url).extract_contained_resources('=> link\ntext\n=> other')
+ resources = GeminiResource(url).extract_contained_resources(
+ "=> link\ntext\n=> other"
+ )
self.assertEqual(len(resources), 2)
- self.assertEqual(resources[0].raw_url, 'link')
- self.assertEqual(resources[1].raw_url, 'other')
+ self.assertEqual(resources[0].raw_url, "link")
+ self.assertEqual(resources[1].raw_url, "other")
- resources = GeminiResource(url).extract_contained_resources("""
+ resources = GeminiResource(url).extract_contained_resources(
+ """
# title
text
=> link
@@ -32,7 +36,8 @@ text
=> no link
```
=> other
- """)
+ """
+ )
self.assertEqual(len(resources), 2)
- self.assertEqual(resources[0].raw_url, 'link')
- self.assertEqual(resources[1].raw_url, 'other')
+ self.assertEqual(resources[0].raw_url, "link")
+ self.assertEqual(resources[1].raw_url, "other")
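
A compact restatement of the behavior these tests pin down (the link
lines are illustrative, not cases from the suite):

    # Sketch only: "=>" lines count as links unless they sit inside a
    # ``` preformatted block, and each resource exposes its raw target.
    r = GeminiResource("gemini://host")
    resources = r.extract_contained_resources(
        "=> docs/a.gmi\n```\n=> ignored\n```"
    )
    assert [res.raw_url for res in resources] == ["docs/a.gmi"]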