commit c562254a65a505dc9f5e5c54cb8ee6be810c644a
parent 6ea586ae2380fcdedf5c83d5ee99438370390cad
Author: René Wagner <rwa@clttr.info>
Date: Fri, 6 Jan 2023 11:55:41 +0100
fix crawl, revamp crawl & indexing procedure
Diffstat:
6 files changed, 16 insertions(+), 33 deletions(-)
diff --git a/README.md b/README.md
@@ -38,9 +38,7 @@ Now you'll have created `index.new` directory, rename it to `index`.
### Running the crawl & indexer in production with systemd
-1. update `infra/gus-crawl.service` & `infra/gus-index.service` to match your needs (directory, user)
-2. copy both files to `/etc/systemd/system/`
-3. set up a cron job for root with the following params: `0 9 */3 * * systemctl start gus-crawl --no-block`
+1. set up a cron job with the following entry: `0 9 * * * /home/gus/infra/update_index.sh /home/gus`
## Running the test suite
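Note: the two systemd units removed further down are replaced by a single cron-driven shell script, infra/update_index.sh, which the crontab entry above runs daily at 09:00. A minimal installation sketch, assuming the checkout lives in /home/gus and the job should run as the gus user (as the old units did):

    # as root: edit the gus user's crontab and add the entry from the README
    crontab -u gus -e
    0 9 * * * /home/gus/infra/update_index.sh /home/gus
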
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -514,8 +514,13 @@ def crawl_page(
def load_expired_urls():
- expired_pages = Page.select(Page.ur).where(last_crawl_at < (datetime.now() + timedelta(hours=(Page.change_frequency * -1))) & Page.last_crawl_at.is_null(True))
+ expired_pages = Page.raw(
+ """SELECT p.url
+ FROM page as p
WHERE datetime(last_crawl_at, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now') OR last_crawl_at IS NULL""")
return [page.url for page in expired_pages.execute()]
+# expired_pages = Page.select(Page.url).where((Page.last_crawl_at < (datetime.now() - timedelta(hours=Page.change_frequency))) | Page.last_crawl_at.is_null(True))
+# return [page.url for page in expired_pages]
def load_seed_request_urls():
with open("seed-requests.txt") as f:
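Note: the raw SQL above uses SQLite's datetime() modifier syntax. REPLACE('fnord hours', 'fnord', change_frequency) builds an interval string such as '24 hours' from the per-page change_frequency column, which is presumably why the Peewee version was left commented out: timedelta() needs a concrete number, not a column reference. A page is therefore due when last_crawl_at plus its change_frequency in hours lies in the past, or when it has never been crawled. A plain-Python sketch of the same predicate, for illustration only (is_expired is not a function in this codebase):

    from datetime import datetime, timedelta

    def is_expired(last_crawl_at, change_frequency):
        """Mirror the SQL predicate: due for re-crawl once the per-page
        interval has elapsed, or if the page was never crawled at all."""
        if last_crawl_at is None:
            return True
        return last_crawl_at + timedelta(hours=change_frequency) < datetime.now()
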
diff --git a/infra/filter_seed_request.sh b/infra/filter_seed_request.sh
@@ -1,3 +1,3 @@
-cd /home/gus/
-cat seed-requests.txt | grep -iE '^gemini:\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,10}\b[-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*$' | sort | uniq > seed-requests.txt2
-mv seed-requests.txt2 seed-requests.txt
+BASEDIR=${1}
+cat ${BASEDIR}/seed-requests.txt | grep -iE '^gemini:\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,10}\b[-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*$' | sort | uniq > ${BASEDIR}/seed-requests.txt2
+mv ${BASEDIR}/seed-requests.txt2 ${BASEDIR}/seed-requests.txt
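Note: filter_seed_request.sh now takes the base directory as its first argument instead of hard-coding /home/gus. The grep keeps only syntactically plausible gemini:// URLs from seed-requests.txt, and sort | uniq drops duplicates before the file is rewritten in place. update_index.sh below calls it as ${BASEDIR}/infra/filter_seed_request.sh ${BASEDIR}, i.e. with the document's default layout:

    sh /home/gus/infra/filter_seed_request.sh /home/gus
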
diff --git a/infra/gus-crawl.service b/infra/gus-crawl.service
@@ -1,13 +0,0 @@
-# /etc/systemd/system/gus.service
-
-[Unit]
-Description=Gemini Universal Search - Crawler
-
-[Service]
-User=gus
-Group=gus
-Type=oneshot
-WorkingDirectory=/home/gus/
-Environment="PYTHONUNBUFFERED=1"
-ExecStart=/home/gus/.poetry/bin/poetry run crawl
-ExecStopPost=sudo systemctl start gus-index --no-block
diff --git a/infra/gus-index.service b/infra/gus-index.service
@@ -1,13 +0,0 @@
-# /etc/systemd/system/gus.service
-
-[Unit]
-Description=Gemini Universal Search - Indexer
-
-[Service]
-User=gus
-Group=gus
-Type=oneshot
-WorkingDirectory=/home/gus
-Environment="PYTHONUNBUFFERED=1"
-ExecStart=/home/gus/.poetry/bin/poetry run build_index
-ExecStopPost=sudo systemctl restart gus
diff --git a/infra/update_index.sh b/infra/update_index.sh
@@ -0,0 +1,6 @@
+BASEDIR=${1}
+rm ${BASEDIR}/logs/gus.log
+${BASEDIR}/infra/filter_seed_request.sh ${BASEDIR}
+${BASEDIR}/.poetry/bin/poetry run crawl
+${BASEDIR}/.poetry/bin/poetry run build_index
+sudo systemctl restart gus
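Note: update_index.sh strings together what the two systemd units used to do: clear the previous log, sanitize the seed requests, run the crawl, rebuild the index, and finally restart the gus service. The systemctl restart still goes through sudo, so the invoking user presumably needs a matching sudoers rule, just as the old ExecStopPost lines imply. Invoked from cron per the README entry:

    0 9 * * * /home/gus/infra/update_index.sh /home/gus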