summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Kevin Wallace <kevin@wallace.seattle.wa.us> 2025-01-25 20:54:42 -0800
committer Kevin Wallace <kevin@wallace.seattle.wa.us> 2025-01-25 21:02:49 -0800
commit c88b9d130896c0786c048bc6d3c974520f261f4a (patch)
tree 7de29c66f1e1e18156dcf42c4688b838cd62ed15
parent use wikiextractor from local env, fetch intermediate .bz2, `xpath -e` (diff)
version phraser indexes HEAD master
-rw-r--r-- Makefile 45
1 files changed, 31 insertions, 14 deletions
diff --git a/Makefile b/Makefile
index 95d78f5..5d399bf 100644
--- a/Makefile
+++ b/Makefile
@@ -5,8 +5,8 @@ all: enwiktionary-latest.version
all: enwiktionary-latest.index
all: enwiki-latest.version
all: enwiki-latest.index
-all: phraser-words500K.index
-all: phraser-phrases5M.index
+all: phraser-words500K-latest.index
+all: phraser-phrases5M-latest.index
.PHONY: newest
newest:
@@ -26,6 +26,9 @@ trim:
ls enwiktionary-*-snap.index | sort | head -n -2 | xargs -t rm
ls enwiki-*-pages-articles.xml.bz2 | sort | head -n -2 | xargs -t rm
ls enwiktionary-*-pages-articles.xml.bz2 | sort | head -n -2 | xargs -t rm
+ ls phraser-words500K-*-snap.index | sort | head -n -2 | xargs -t rm
+ ls phraser-phrases5M-*-snap.index | sort | head -n -2 | xargs -t rm
+ ls phraser-*.txt | sort | head -n -2 | xargs -t rm
%/.git:
git submodule update --init $*
@@ -33,8 +36,8 @@ trim:
nutrimatic/bin: nutrimatic/.git
cd nutrimatic && ./build.py
-%-latest.version:
- wget -O - https://dumps.wikimedia.org/$*/latest/$*-latest-pages-articles.xml.bz2-rss.xml |\
+en%-latest.version:
+ wget -O - https://dumps.wikimedia.org/en$*/latest/en$*-latest-pages-articles.xml.bz2-rss.xml |\
xpath -e '//rss/channel/link/text()' |\
sed 's/^.*\///' > $@
@@ -53,21 +56,35 @@ enwiktionary-%-snap.merge: nutrimatic/bin enwiktionary-%-pages-articles.xml.bz2
wikiextractor -q -o- enwiktionary-$*-pages-articles.xml.bz2 |\
./merge.sh $@
-%-snap.index: %-snap.merge nutrimatic/bin
- nutrimatic/bin/merge-indexes 5 $*-snap.merge $@
+en%-snap.index: en%-snap.merge nutrimatic/bin
+ nutrimatic/bin/merge-indexes 5 en$*-snap.merge $@
%-latest.index: %-latest.version
make $*-$(shell cat $*-latest.version)-snap.index
ln -sf $*-$(shell cat $*-latest.version)-snap.index $@
-phraser-words500K.index: nutrimatic/bin
+phraser-words500K-latest.version:
+ date -d \
+ "$$(curl -I https://lahosken.san-francisco.ca.us/frivolity/prog/phraser/words_500K.txt |\
+ awk -F': ' '/^last-modified: / { print $$2 }')" \
+ +"%Y%m%d%H%M%S" > $@
+
+phraser-phrases5M-latest.version:
+ date -d \
+ "$$(curl -I https://lahosken.san-francisco.ca.us/frivolity/prog/phraser/phrases_5M.txt |\
+ awk -F': ' '/^last-modified: / { print $$2 }')" \
+ +"%Y%m%d%H%M%S" > $@
+
+phraser-words500K-%.txt:
+ wget -O $@ https://lahosken.san-francisco.ca.us/frivolity/prog/phraser/words_500K.txt
+
+phraser-phrases5M-%.txt:
+ wget -O $@ https://lahosken.san-francisco.ca.us/frivolity/prog/phraser/phrases_5M.txt
+
+phraser-words500K-%-snap.index: phraser-words500K-%.txt nutrimatic/bin
# ~ 3m
- wget -O - https://lahosken.san-francisco.ca.us/frivolity/prog/phraser/words_500K.txt |\
- awk -F'\t' '{ for (i=0; i < $$1; i++) print $$2 }' |\
- ./merge.sh $@
+ awk -F'\t' '{ for (i=0; i < $$1; i++) print $$2 }' phraser-words500K-$*.txt | ./merge.sh $@
-phraser-phrases5M.index: nutrimatic/bin
+phraser-phrases5M-%-snap.index: phraser-phrases5M-%.txt nutrimatic/bin
# ~ 10m
- wget -O - https://lahosken.san-francisco.ca.us/frivolity/prog/phraser/phrases_5M.txt |\
- awk -F'\t' '{ for (i=0; i < $$1; i++) print $$2 }' |\
- ./merge.sh $@
+ awk -F'\t' '{ for (i=0; i < $$1; i++) print $$2 }' phraser-phrases5M-$*.txt | ./merge.sh $@