geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 87d92bbfb37a77d905812d036b79c6f717c5137a
parent 80e589b1d481bb85344c9341bb61d690e58200de
Author: René Wagner <rwa@clttr.info>
Date:   Mon, 12 Jul 2021 21:37:55 +0200

index text files up to 5 MB

fix flagging pages as indexed

Diffstat:
M gus/build_index.py | 34 +++++++++++++++++++---------------
M gus/constants.py | 2 +-
M gus/lib/index_statistics.py | 7 ++++---
M serve/templates/fragments/footer.gmi | 3 +--
4 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py @@ -67,10 +67,8 @@ AND l.is_cross_host_like == 1""", try: logging.debug("Adding document to index: %s", page.url); index.add_document(document) -# logging.debug("Updating ge in sqlite store: %s", page.url) -# page.indexed_at=datetime.utcnow() -# page.save() logging.debug("Document done") + return True except Exception as e: logging.exception( @@ -124,22 +122,28 @@ OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PA logging.info("Commiting search index...") index.close() logging.info("Updating raw data...") + if (should_run_destructive): pages = Page.raw( - """UPDATE page SET indexed_at = ? -WHERE last_status == 20 -AND (content_type NOT LIKE 'text/%' -OR (content_type LIKE 'text/%' AND size <= ?))""", -datetime.utcnow(), constants.MAXIMUM_TEXT_PAGE_SIZE) + """SELECT p.* FROM page AS p +WHERE p.last_status == 20 +AND (p.content_type NOT LIKE 'text/%' +OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PAGE_SIZE + ) else: pages = Page.raw( - """UPDATE page SET indexed_at = ? 
-WHERE last_status == 20 -AND (indexed_at IS NULL OR -indexed_at < last_crawl_success_at) -AND (content_type NOT LIKE 'text/%' -OR (content_type LIKE 'text/%' AND size <= ?))""", -datetime.utcnow(), constants.MAXIMUM_TEXT_PAGE_SIZE) + """SELECT p.* FROM page AS p +WHERE p.last_status == 20 +AND (p.indexed_at IS NULL OR +p.indexed_at < p.last_crawl_success_at) +AND (p.content_type NOT LIKE 'text/%' +OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PAGE_SIZE + ) + + timestamp = datetime.utcnow() + for page in pages.iterator(): + page.indexed_at = timestamp; + page.save() except Exception as e: logging.error('Closing of index failed: %s', e); diff --git a/gus/constants.py b/gus/constants.py @@ -6,7 +6,7 @@ DB_FILENAME = "gus.sqlite" FEED_FILE = "feeds.txt" MAXIMUM_REDIRECT_CHAIN_LENGTH = 5 MAXIMUM_FAILED_REQUEST_COUNT = 5 -MAXIMUM_TEXT_PAGE_SIZE = 1024000 # 1000KB, in bytes +MAXIMUM_TEXT_PAGE_SIZE = 5120000 # 1000KB, in bytes # default change frequencies (in hours) ROOT_CHANGE_FREQUENCY_DEFAULT = 24 diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py @@ -10,11 +10,11 @@ from gus.lib.db_model import Page def compute_index_statistics(db): page_count = len(Page.raw("""SELECT DISTINCT p.id FROM page AS p -WHERE last_crawl_success_at IS NOT NULL""").dicts()) +WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL""").dicts()) domains_query = Page.raw("""SELECT DISTINCT p.domain, p.port FROM page AS p -WHERE last_crawl_success_at IS NOT NULL""") +WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL""") domains = [] for d in domains_query.execute(): s = d.domain @@ -32,11 +32,12 @@ WHERE last_crawl_success_at IS NOT NULL""") content_type_frequencies = (Page.raw("""SELECT p.content_type, count(p.content_type) as 'count' FROM page AS p +WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL GROUP BY p.content_type ORDER BY 2 desc""").dicts()) charset_frequencies = (Page.raw("""SELECT upper(p.charset), 
count(p.id) as 'count' FROM page AS p -WHERE p.charset IS NOT NULL +WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL AND p.charset IS NOT NULL GROUP BY upper(p.charset) ORDER BY 2 desc""").dicts()) index_modification_time = Page.select(fn.Max(Page.last_crawl_at)).scalar() diff --git a/serve/templates/fragments/footer.gmi b/serve/templates/fragments/footer.gmi @@ -1,5 +1,4 @@ => /add-seed missing results? add your capsule to geminispace.info -Index updated on: {{ index_modification_time|datetimeformat }} - +> Index updated on: {{ index_modification_time|datetimeformat }}