commit c562254a65a505dc9f5e5c54cb8ee6be810c644a
parent 6ea586ae2380fcdedf5c83d5ee99438370390cad
Author: René Wagner <rwa@clttr.info>
Date: Fri, 6 Jan 2023 11:55:41 +0100
fix crawl, revamp crawl & indexing procedure
Diffstat:
6 files changed, 16 insertions(+), 33 deletions(-)
diff --git a/README.md b/README.md
@@ -38,9 +38,7 @@ Now you'll have created `index.new` directory, rename it to `index`.
### Running the crawl & indexer in production with systemd
-1. update `infra/gus-crawl.service` & `infra/gus-index.service` to match your needs (directory, user)
-2. copy both files to `/etc/systemd/system/`
-3. set up a cron job for root with the following params: `0 9 */3 * * systemctl start gus-crawl --no-block`
+1. set up a cron job with the following entry: `0 9 * * * /home/gus/infra/update_index.sh /home/gus`
## Running the test suite
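Note: the two systemd units removed further down are replaced by a single cron-driven shell script, infra/update_index.sh, which the crontab entry above runs daily at 09:00. A minimal installation sketch, assuming the checkout lives in /home/gus and the job should run as the gus user (as the old units did):

    # as root: edit the gus user's crontab and add the entry from the README
    crontab -u gus -e
    0 9 * * * /home/gus/infra/update_index.sh /home/gus
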
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -514,8 +514,13 @@ def crawl_page(
def load_expired_urls():
- expired_pages = Page.select(Page.ur).where(last_crawl_at < (datetime.now() + timedelta(hours=(Page.change_frequency * -1))) & Page.last_crawl_at.is_null(True))
+ expired_pages = Page.raw(
+ """SELECT p.url
+ FROM page as p
WHERE datetime(last_crawl_at, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now') OR last_crawl_at IS NULL""")
return [page.url for page in expired_pages.execute()]
+# expired_pages = Page.select(Page.url).where((Page.last_crawl_at < (datetime.now() - timedelta(hours=Page.change_frequency))) | Page.last_crawl_at.is_null(True))
+# return [page.url for page in expired_pages]
def load_seed_request_urls():
with open("seed-requests.txt") as f:
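Note: the raw SQL above uses SQLite's datetime() modifier syntax. REPLACE('fnord hours', 'fnord', change_frequency) builds an interval string such as '24 hours' from the per-page change_frequency column, which is presumably why the Peewee version was left commented out: timedelta() needs a concrete number, not a column reference. A page is therefore due when last_crawl_at plus its change_frequency in hours lies in the past, or when it has never been crawled. A plain-Python sketch of the same predicate, for illustration only (is_expired is not a function in this codebase):

    from datetime import datetime, timedelta

    def is_expired(last_crawl_at, change_frequency):
        """Mirror the SQL predicate: due for re-crawl once the per-page
        interval has elapsed, or if the page was never crawled at all."""
        if last_crawl_at is None:
            return True
        return last_crawl_at + timedelta(hours=change_frequency) < datetime.now()
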
diff --git a/infra/filter_seed_request.sh b/infra/filter_seed_request.sh
@@ -1,3 +1,3 @@
-cd /home/gus/
-cat seed-requests.txt | grep -iE '^gemini:\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,10}\b[-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*$' | sort | uniq > seed-requests.txt2
-mv seed-requests.txt2 seed-requests.txt
+BASEDIR=${1}
+cat ${BASEDIR}/seed-requests.txt | grep -iE '^gemini:\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,10}\b[-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*$' | sort | uniq > ${BASEDIR}/seed-requests.txt2
+mv ${BASEDIR}/seed-requests.txt2 ${BASEDIR}/seed-requests.txt
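Note: filter_seed_request.sh now takes the base directory as its first argument instead of hard-coding /home/gus. The grep keeps only syntactically plausible gemini:// URLs from seed-requests.txt, and sort | uniq drops duplicates before the file is rewritten in place. update_index.sh below calls it as ${BASEDIR}/infra/filter_seed_request.sh ${BASEDIR}, i.e. with the document's default layout:

    sh /home/gus/infra/filter_seed_request.sh /home/gus
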
diff --git a/infra/gus-crawl.service b/infra/gus-crawl.service
@@ -1,13 +0,0 @@
-# /etc/systemd/system/gus.service
-
-[Unit]
-Description=Gemini Universal Search - Crawler
-
-[Service]
-User=gus
-Group=gus
-Type=oneshot
-WorkingDirectory=/home/gus/
-Environment="PYTHONUNBUFFERED=1"
-ExecStart=/home/gus/.poetry/bin/poetry run crawl
-ExecStopPost=sudo systemctl start gus-index --no-block
diff --git a/infra/gus-index.service b/infra/gus-index.service
@@ -1,13 +0,0 @@
-# /etc/systemd/system/gus.service
-
-[Unit]
-Description=Gemini Universal Search - Indexer
-
-[Service]
-User=gus
-Group=gus
-Type=oneshot
-WorkingDirectory=/home/gus
-Environment="PYTHONUNBUFFERED=1"
-ExecStart=/home/gus/.poetry/bin/poetry run build_index
-ExecStopPost=sudo systemctl restart gus
diff --git a/infra/update_index.sh b/infra/update_index.sh
@@ -0,0 +1,6 @@
+BASEDIR=${1}
+rm ${BASEDIR}/logs/gus.log
+${BASEDIR}/infra/filter_seed_request.sh ${BASEDIR}
+${BASEDIR}/.poetry/bin/poetry run crawl
+${BASEDIR}/.poetry/bin/poetry run build_index
+sudo systemctl restart gus
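Note: update_index.sh strings together what the two systemd units used to do: clear the previous log, sanitize the seed requests, run the crawl, rebuild the index, and finally restart the gus service. The systemctl restart still goes through sudo, so the invoking user presumably needs a matching sudoers rule, just as the old ExecStopPost lines imply. Invoked from cron per the README entry:

    0 9 * * * /home/gus/infra/update_index.sh /home/gus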