commit 80e589b1d481bb85344c9341bb61d690e58200de
parent cddbb82dfd62c8e7e2601fd53b46359cdca1bb06
Author: René Wagner <rwa@clttr.info>
Date: Mon, 12 Jul 2021 19:27:57 +0200
commit search index only when indexing is complete
unnecessary commits during indexing are time-consuming
remove dead "feedparser" code from crawl
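The Whoosh writer used to commit every 5000 documents, and every page row
was saved to sqlite individually while indexing; both are now deferred to
the end of the run. A rough sketch of the new pattern (whoosh and peewee,
as in the diff below; the model, schema and driver here are illustrative,
not the actual GUS code):

    from datetime import datetime

    import peewee
    from whoosh import index as whoosh_index
    from whoosh.fields import ID, TEXT, Schema

    db = peewee.SqliteDatabase("gus.sqlite")

    class Page(peewee.Model):
        url = peewee.TextField()
        last_status = peewee.IntegerField(null=True)
        indexed_at = peewee.DateTimeField(null=True)

        class Meta:
            database = db

    def build_index(documents, index_dir):
        """documents: iterable of (url, text) pairs; index_dir must exist."""
        schema = Schema(url=ID(stored=True, unique=True), content=TEXT)
        ix = whoosh_index.create_in(index_dir, schema)
        writer = ix.writer()
        for url, text in documents:
            # old behaviour: a page.save() per document here, plus a
            # writer.commit() every 5000 documents
            writer.add_document(url=url, content=text)
        writer.commit()  # new behaviour: one commit once indexing is complete
        # ...and a single bulk UPDATE instead of one save() per row.
        # peewee's Model.raw() is lazy: the statement only runs when the
        # query is iterated or .execute() is called.
        Page.raw(
            "UPDATE page SET indexed_at = ? WHERE last_status == 20",
            datetime.utcnow(),
        ).execute()

With this, the page table is written once, after the index commit,
instead of once per document.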
Diffstat:
4 files changed, 26 insertions(+), 124 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -67,9 +67,10 @@ AND l.is_cross_host_like == 1""",
try:
logging.debug("Adding document to index: %s", page.url);
index.add_document(document)
- page.indexed_at=datetime.utcnow()
- page.save()
-
+# logging.debug("Updating page in sqlite store: %s", page.url)
+# page.indexed_at=datetime.utcnow()
+# page.save()
+ logging.debug("Document done")
return True
except Exception as e:
logging.exception(
@@ -120,10 +121,30 @@ OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PA
index_page(index, page)
try:
+ logging.info("Commiting search index...")
index.close()
+ logging.info("Updating raw data...")
+ if should_run_destructive:
+ pages = Page.raw(
+ """UPDATE page SET indexed_at = ?
+WHERE last_status == 20
+AND (content_type NOT LIKE 'text/%'
+OR (content_type LIKE 'text/%' AND size <= ?))""",
+datetime.utcnow(), constants.MAXIMUM_TEXT_PAGE_SIZE).execute()
+ else:
+ pages = Page.raw(
+ """UPDATE page SET indexed_at = ?
+WHERE last_status == 20
+AND (indexed_at IS NULL OR
+indexed_at < last_crawl_success_at)
+AND (content_type NOT LIKE 'text/%'
+OR (content_type LIKE 'text/%' AND size <= ?))""",
+datetime.utcnow(), constants.MAXIMUM_TEXT_PAGE_SIZE).execute()
+
except Exception as e:
logging.error('Closing of index failed: %s', e);
+ logging.debug("Updating statistics...")
index_statistics = compute_index_statistics(db)
log_index_statistics(index_statistics)
persist_statistics(index_statistics, None, should_run_destructive, "statistics.csv")
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -8,13 +8,6 @@ import pathlib
import time
from urllib.parse import urljoin, uses_relative, uses_netloc
-# TODO: this import breaks with Python 3.9, but all code that relies
-# on this code path is currently dead code, so for now I'm just
-# commenting out the import. It would be nice to make an actual
-# decision soon about whether or not feed-based updating is even
-# worth keeping around. If not, the dead code paths could simply
-# deleted, and GUS would get a bit simpler :)
-# import feedparser
import peewee
from gus.excludes import EXCLUDED_URL_PREFIXES, EXCLUDED_URL_PATHS
@@ -510,99 +503,6 @@ def load_seed_request_urls():
content = [x.strip() for x in content]
return content
-
-def load_feed_urls(filename):
- feeds = []
- with open(filename, "r") as fp:
- for line in fp:
- line = line.strip()
- if not line or line.startswith("#"):
- continue
- feeds.append(line)
- return feeds
-
-
-def items_from_feed_string(feed_str):
- feed_obj = feedparser.parse(feed_str)
- feed = feed_obj.feed
- return [
- (entry.updated_parsed, entry.link, entry.title, feed.title)
- for entry in feed_obj.entries
- ]
-
-
-def resolve_feed_content_urls(feed_file=constants.FEED_FILE):
- # Load feed URLs to query
- feed_urls = load_feed_urls(feed_file)
- N = len(feed_urls)
-
- # Prepare to extract feed items
- last_accessed = {}
- skips = 0
- items = []
- while feed_urls:
- # Get a feed URL to fetch
- feed_url = feed_urls.pop()
- feed_resource = GeminiResource(feed_url)
-
- # Don't hammer servers
- last = last_accessed.get(feed_resource.normalized_host, 0)
- now = time.time()
- interval = int(now - last)
- if interval < 5:
- logging.warn(
- "Declining to hit %s again after only %d seconds",
- gus.lib.logging.strip_control_chars(feed_resource.normalized_host),
- interval,
- )
- feed_urls.insert(0, feed_url)
- skips += 1
- if skips == len(feed_urls):
- # We've hammered every server in the queue! Sleep a bit...
- logging.warn("Sleeping to give all servers a rest!")
- time.sleep(5)
- continue
- skips = 0
-
- # Good to go
- logging.info("Fetching feed: %s", gus.lib.logging.strip_control_chars(feed_url))
- try:
- resp = feed_resource.fetch()
- except:
- logging.info(
- "Error fetching feed, skipping: %s",
- gus.lib.logging.strip_control_chars(feed_url),
- )
- continue
- if resp and resp.status == "20":
- last_accessed[feed_resource.normalized_host] = time.time()
- items.extend(items_from_feed_string(resp.content))
- return [item[1] for item in items]
-
-
-def recrawl_feeds():
- content_urls = resolve_feed_content_urls()
- global index_dir
- index_dir = constants.INDEX_DIR
- global db
- db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
- global max_crawl_depth
- max_crawl_depth = 0
- global robot_file_map
- robot_file_map = {}
- global domain_hit_timings
- domain_hit_timings = {}
-
- seed_resources = [GeminiResource(url) for url in content_urls]
- for resource in seed_resources:
- crawl_page(resource, 0)
-
- logging.debug(
- "Recrawled feeds: %s", gus.lib.logging.strip_control_chars(content_urls)
- )
- logging.info("Finished!")
-
-
def run_crawl(should_run_destructive=False, seed_urls=[]):
global index_dir
index_dir = constants.INDEX_DIR if should_run_destructive else constants.INDEX_DIR
@@ -632,10 +532,7 @@ def main():
args = parse_args()
gus.lib.logging.handle_arguments(args)
- if args.should_recrawl_feeds:
- recrawl_feeds()
- else:
- run_crawl(args.should_run_destructive, seed_urls=args.seed_urls)
+ run_crawl(args.should_run_destructive, seed_urls=args.seed_urls)
def parse_args():
@@ -649,14 +546,6 @@ def parse_args():
help="create a fresh index and perform a full Geminispace crawl",
)
parser.add_argument(
- "--feeds",
- "-f",
- dest="should_recrawl_feeds",
- action="store_true",
- default=False,
- help="recrawl known atom feeds",
- )
- parser.add_argument(
"--seeds",
"-s",
metavar="URL",
@@ -669,6 +558,5 @@ def parse_args():
args = parser.parse_args()
return args
-
if __name__ == "__main__":
main()
diff --git a/gus/excludes.py b/gus/excludes.py
@@ -140,7 +140,7 @@ EXCLUDED_URL_PREFIXES = [
"gemini://gemini.cabestan.tk/hn",
"gemini://hn.filiuspatris.net/",
"gemini://schmittstefan.de/de/nachrichten/",
-
+ "gemini://gmi.noulin.net/mobile"
# wikipedia proxy
"gemini://wp.pitr.ca/",
"gemini://wp.glv.one/",
diff --git a/gus/lib/search.py b/gus/lib/search.py
@@ -41,7 +41,6 @@ class Index:
order=whoosh.highlight.SCORE,
)
- self._write_counter = 0
self._writer = None
def _create(self, index_storage):
@@ -76,12 +75,6 @@ class Index:
self._index.close()
def _rolling_writer(self):
- self._write_counter += 1
- if self._writer and self._write_counter % 5000 == 0:
- logging.debug("Committing index.")
- self._writer.commit()
- self._writer = None
-
if not self._writer:
self._writer = self._index.writer()