commit 87d92bbfb37a77d905812d036b79c6f717c5137a
parent 80e589b1d481bb85344c9341bb61d690e58200de
Author: René Wagner <rwa@clttr.info>
Date: Mon, 12 Jul 2021 21:37:55 +0200
index text files up to 5 MB
fix flagging pages as indexed
Diffstat:
4 files changed, 25 insertions(+), 21 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -67,10 +67,8 @@ AND l.is_cross_host_like == 1""",
try:
logging.debug("Adding document to index: %s", page.url);
index.add_document(document)
-# logging.debug("Updating ge in sqlite store: %s", page.url)
-# page.indexed_at=datetime.utcnow()
-# page.save()
logging.debug("Document done")
+
return True
except Exception as e:
logging.exception(
@@ -124,22 +122,28 @@ OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PA
logging.info("Commiting search index...")
index.close()
logging.info("Updating raw data...")
+
if (should_run_destructive):
pages = Page.raw(
- """UPDATE page SET indexed_at = ?
-WHERE last_status == 20
-AND (content_type NOT LIKE 'text/%'
-OR (content_type LIKE 'text/%' AND size <= ?))""",
-datetime.utcnow(), constants.MAXIMUM_TEXT_PAGE_SIZE)
+ """SELECT p.* FROM page AS p
+WHERE p.last_status == 20
+AND (p.content_type NOT LIKE 'text/%'
+OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PAGE_SIZE
+ )
else:
pages = Page.raw(
- """UPDATE page SET indexed_at = ?
-WHERE last_status == 20
-AND (indexed_at IS NULL OR
-indexed_at < last_crawl_success_at)
-AND (content_type NOT LIKE 'text/%'
-OR (content_type LIKE 'text/%' AND size <= ?))""",
-datetime.utcnow(), constants.MAXIMUM_TEXT_PAGE_SIZE)
+ """SELECT p.* FROM page AS p
+WHERE p.last_status == 20
+AND (p.indexed_at IS NULL OR
+p.indexed_at < p.last_crawl_success_at)
+AND (p.content_type NOT LIKE 'text/%'
+OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PAGE_SIZE
+ )
+
+ timestamp = datetime.utcnow()
+ for page in pages.iterator():
+ page.indexed_at = timestamp;
+ page.save()
except Exception as e:
logging.error('Closing of index failed: %s', e);
diff --git a/gus/constants.py b/gus/constants.py
@@ -6,7 +6,7 @@ DB_FILENAME = "gus.sqlite"
FEED_FILE = "feeds.txt"
MAXIMUM_REDIRECT_CHAIN_LENGTH = 5
MAXIMUM_FAILED_REQUEST_COUNT = 5
-MAXIMUM_TEXT_PAGE_SIZE = 1024000 # 1000KB, in bytes
+MAXIMUM_TEXT_PAGE_SIZE = 5120000 # 5000KB, in bytes
# default change frequencies (in hours)
ROOT_CHANGE_FREQUENCY_DEFAULT = 24
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -10,11 +10,11 @@ from gus.lib.db_model import Page
def compute_index_statistics(db):
page_count = len(Page.raw("""SELECT DISTINCT p.id
FROM page AS p
-WHERE last_crawl_success_at IS NOT NULL""").dicts())
+WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL""").dicts())
domains_query = Page.raw("""SELECT DISTINCT p.domain, p.port
FROM page AS p
-WHERE last_crawl_success_at IS NOT NULL""")
+WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL""")
domains = []
for d in domains_query.execute():
s = d.domain
@@ -32,11 +32,12 @@ WHERE last_crawl_success_at IS NOT NULL""")
content_type_frequencies = (Page.raw("""SELECT p.content_type, count(p.content_type) as 'count'
FROM page AS p
+WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL
GROUP BY p.content_type
ORDER BY 2 desc""").dicts())
charset_frequencies = (Page.raw("""SELECT upper(p.charset), count(p.id) as 'count'
FROM page AS p
-WHERE p.charset IS NOT NULL
+WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL AND p.charset IS NOT NULL
GROUP BY upper(p.charset)
ORDER BY 2 desc""").dicts())
index_modification_time = Page.select(fn.Max(Page.last_crawl_at)).scalar()
diff --git a/serve/templates/fragments/footer.gmi b/serve/templates/fragments/footer.gmi
@@ -1,5 +1,4 @@
=> /add-seed missing results? add your capsule to geminispace.info
-Index updated on: {{ index_modification_time|datetimeformat }}
-
+> Index updated on: {{ index_modification_time|datetimeformat }}