commit fa2db540f6a05e5be6423a0e6c4d24950f8fbdf8
parent b484a4dadc7b58eac2f13188a748f64a83d58a26
Author: René Wagner <rwa@clttr.info>
Date: Thu, 14 Oct 2021 18:54:12 +0200
more metadata for index cleanup
Diffstat:
6 files changed, 24 insertions(+), 40 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -79,36 +79,23 @@ AND l.is_cross_host_like == 1""",
return False
-def invalidate_recent_results(index, invalidation_window):
- recency_minimum = datetime.now() - timedelta(hours=invalidation_window)
- pages = Page.select().where(
- Page.indexed_at.is_null(False), Page.indexed_at > recency_minimum
- )
- logging.debug('Invalidating %d pages %s', pages.count(), recency_minimum)
- for page in pages:
- index.delete_by_term("url_id", page.url)
-
-
-def build_index(should_run_destructive=False, invalidation_window=0):
+def build_index(should_run_destructive=False):
index_dir = constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR
-# index_dir = constants.INDEX_DIR_NEW
db = init_db(index_dir + "/gus.sqlite")
index = search.Index(index_dir, should_run_destructive)
- invalidate_recent_results(index, invalidation_window)
-
if (should_run_destructive):
pages = Page.raw(
"""SELECT p.* FROM page AS p
-WHERE p.last_status == 20
+WHERE p.last_success_status == 20
AND (p.content_type NOT LIKE 'text/%'
OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PAGE_SIZE
)
else:
pages = Page.raw(
"""SELECT p.* FROM page AS p
-WHERE p.last_status == 20
+WHERE p.last_success_status == 20
AND (p.indexed_at IS NULL OR
p.indexed_at < p.last_crawl_success_at)
AND (p.content_type NOT LIKE 'text/%'
@@ -126,14 +113,14 @@ OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PA
if (should_run_destructive):
pages = Page.raw(
"""SELECT p.* FROM page AS p
-WHERE p.last_status == 20
+WHERE p.last_success_status == 20
AND (p.content_type NOT LIKE 'text/%'
OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PAGE_SIZE
)
else:
pages = Page.raw(
"""SELECT p.* FROM page AS p
-WHERE p.last_status == 20
+WHERE p.last_success_status == 20
AND (p.indexed_at IS NULL OR
p.indexed_at < p.last_crawl_success_at)
AND (p.content_type NOT LIKE 'text/%'
@@ -159,7 +146,7 @@ OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PA
def main():
args = parse_args()
gus.lib.logging.handle_arguments(args)
- build_index(args.should_run_destructive, args.invalidation_window)
+ build_index(args.should_run_destructive)
def parse_args():
@@ -172,14 +159,6 @@ def parse_args():
default=False,
help="create a fresh index",
)
- parser.add_argument(
- "--invalidation_window",
- "-i",
- dest="invalidation_window",
- type=int,
- default=0,
- help="a recency window, in hours, for recently crawled pages that should be forcefully reindexed",
- )
gus.lib.logging.add_arguments(parser)
args = parser.parse_args()
return args
diff --git a/gus/constants.py b/gus/constants.py
@@ -6,7 +6,7 @@ DB_FILENAME = "gus.sqlite"
FEED_FILE = "feeds.txt"
MAXIMUM_REDIRECT_CHAIN_LENGTH = 5
MAXIMUM_FAILED_REQUEST_COUNT = 5
-MAXIMUM_TEXT_PAGE_SIZE = 5120000 # 1000KB, in bytes
+MAXIMUM_TEXT_PAGE_SIZE = 5120000 # in bytes
# default change frequencies (in hours)
ROOT_CHANGE_FREQUENCY_DEFAULT = 24
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -49,7 +49,8 @@ def index_binary(resource, response):
"last_crawl_at": datetime.utcnow(),
"last_crawl_success_at": datetime.utcnow(),
"last_status" : response.status,
- "last_stats_message" : response.error_message,
+ "last_success_status": response.status,
+ "last_status_message" : response.error_message,
"first_seen_at" : datetime.utcnow()
}
existing_page = Page.get_or_none(url=resource.normalized_url)
@@ -89,7 +90,8 @@ def index_redirect(resource, response):
"last_crawl_at": datetime.utcnow(),
"last_crawl_success_at": datetime.utcnow(),
"last_status" : response.status,
- "last_stats_message" : response.error_message,
+ "last_success_status" : response.status,
+ "last_status_message" : response.error_message,
"first_seen_at" : datetime.utcnow()
}
existing_page = Page.get_or_none(url=resource.normalized_url)
@@ -164,7 +166,8 @@ def index_prompt(resource, response):
"last_crawl_at": datetime.utcnow(),
"last_crawl_success_at": datetime.utcnow(),
"last_status" : response.status,
- "last_stats_message" : response.error_message,
+ "last_success_status" : response.status,
+ "last_status_message" : response.error_message,
"first_seen_at" : datetime.utcnow()
}
existing_page = Page.get_or_none(url=resource.normalized_url)
@@ -208,7 +211,8 @@ def index_content(resource, response):
"last_crawl_at": datetime.utcnow(),
"last_crawl_success_at": datetime.utcnow(),
"last_status" : response.status,
- "last_stats_message" : response.error_message,
+ "last_success_status" : response.status,
+ "last_status_message" : response.error_message,
"first_seen_at" : datetime.utcnow()
}
if response.content_type == "text/gemini":
@@ -257,10 +261,10 @@ def should_skip(resource):
if m:
should_skip = True
except:
- logging.error("Error checking for exclude of url: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
- should_skip = True
+ logging.error("Error checking for exclude of url: %s", gus.lib.logging.strip_control_chars(resource.raw_url))
+ should_skip = True
- return should_skip
+ return should_skip
def index_links(from_resource, contained_resources):
diff --git a/gus/excludes.py b/gus/excludes.py
@@ -2,7 +2,6 @@
# prepended with the gemini:// protocol, be all lowercased, and
# not have the port specified if it is 1965.
EXCLUDED_URL_PREFIXES = [
- # test and other invalid URIs
"gemini://localhost",
"gemini://example.org",
"gemini://example.com",
@@ -136,6 +135,7 @@ EXCLUDED_URL_PREFIXES = [
"gemini://gemini.susa.net/cgi-bin/search?",
"gemini://gemini.susa.net/cgi-bin/twitter?",
"gemini://gemini.susa.net/cgi-bin/vim-search?",
+ "gemini://gemini.susa.net/cgi-bin/links_stu.lua?",
"gemini://gemini.spam.works/textfiles/",
"gemini://gemini.spam.works/mirrors/textfiles/",
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -46,6 +46,7 @@ class Page(Model):
last_crawl_success_at = DateTimeField(null=True)
last_status = IntegerField(null=True)
last_status_message = TextField(null=True)
+ last_success_status = IntegerField(null=True)
first_seen_at = DateTimeField(null=True)
class Link(Model):
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -10,11 +10,11 @@ from gus.lib.db_model import Page
def compute_index_statistics(db):
page_count = len(Page.raw("""SELECT DISTINCT p.id
FROM page AS p
-WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL""").dicts())
+WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL""").dicts())
domains_query = Page.raw("""SELECT DISTINCT p.domain, p.port
FROM page AS p
-WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL""")
+WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL""")
domains = []
for d in domains_query.execute():
s = d.domain
@@ -32,12 +32,12 @@ WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL""")
content_type_frequencies = (Page.raw("""SELECT p.content_type, count(p.content_type) as 'count'
FROM page AS p
-WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL
+WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL
GROUP BY p.content_type
ORDER BY 2 desc""").dicts())
charset_frequencies = (Page.raw("""SELECT upper(p.charset), count(p.id) as 'count'
FROM page AS p
-WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL AND p.charset IS NOT NULL
+WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL AND p.charset IS NOT NULL
GROUP BY upper(p.charset)
ORDER BY 2 desc""").dicts())
index_modification_time = Page.select(fn.Max(Page.last_crawl_at)).scalar()