diff options
author | Kevin Wallace <kevin@wallace.seattle.wa.us> | 2025-01-25 20:54:42 -0800 |
---|---|---|
committer | Kevin Wallace <kevin@wallace.seattle.wa.us> | 2025-01-25 21:02:49 -0800 |
commit | c88b9d130896c0786c048bc6d3c974520f261f4a (patch) | |
tree | 7de29c66f1e1e18156dcf42c4688b838cd62ed15 | |
parent | use wikiextractor from local env, fetch intermediate .bz2, `xpath -e` (diff) |
-rw-r--r-- | Makefile | 45 |
1 files changed, 31 insertions, 14 deletions
```diff
@@ -5,8 +5,8 @@ all: enwiktionary-latest.version
 all: enwiktionary-latest.index
 all: enwiki-latest.version
 all: enwiki-latest.index
-all: phraser-words500K.index
-all: phraser-phrases5M.index
+all: phraser-words500K-latest.index
+all: phraser-phrases5M-latest.index
 
 .PHONY: newest
 newest:
@@ -26,6 +26,9 @@ trim:
 	ls enwiktionary-*-snap.index | sort | head -n -2 | xargs -t rm
 	ls enwiki-*-pages-articles.xml.bz2 | sort | head -n -2 | xargs -t rm
 	ls enwiktionary-*-pages-articles.xml.bz2 | sort | head -n -2 | xargs -t rm
+	ls phraser-words500K-*-snap.index | sort | head -n -2 | xargs -t rm
+	ls phraser-phrases5M-*-snap.index | sort | head -n -2 | xargs -t rm
+	ls phraser-*.txt | sort | head -n -2 | xargs -t rm
 
 %/.git:
 	git submodule update --init $*
@@ -33,8 +36,8 @@ trim:
 nutrimatic/bin: nutrimatic/.git
 	cd nutrimatic && ./build.py
 
-%-latest.version:
-	wget -O - https://dumps.wikimedia.org/$*/latest/$*-latest-pages-articles.xml.bz2-rss.xml |\
+en%-latest.version:
+	wget -O - https://dumps.wikimedia.org/en$*/latest/en$*-latest-pages-articles.xml.bz2-rss.xml |\
 		xpath -e '//rss/channel/link/text()' |\
 		sed 's/^.*\///' > $@
 
@@ -53,21 +56,35 @@ enwiktionary-%-snap.merge: nutrimatic/bin enwiktionary-%-pages-articles.xml.bz2
 	wikiextractor -q -o- enwiktionary-$*-pages-articles.xml.bz2 |\
 		./merge.sh $@
 
-%-snap.index: %-snap.merge nutrimatic/bin
-	nutrimatic/bin/merge-indexes 5 $*-snap.merge $@
+en%-snap.index: en%-snap.merge nutrimatic/bin
+	nutrimatic/bin/merge-indexes 5 en$*-snap.merge $@
 
 %-latest.index: %-latest.version
 	make $*-$(shell cat $*-latest.version)-snap.index
 	ln -sf $*-$(shell cat $*-latest.version)-snap.index $@
 
-phraser-words500K.index: nutrimatic/bin
+phraser-words500K-latest.version:
+	date -d \
+		"$$(curl -I https://lahosken.san-francisco.ca.us/frivolity/prog/phraser/words_500K.txt |\
+		awk -F': ' '/^last-modified: / { print $$2 }')" \
+		+"%Y%m%d%H%M%S" > $@
+
+phraser-phrases5M-latest.version:
+	date -d \
+		"$$(curl -I https://lahosken.san-francisco.ca.us/frivolity/prog/phraser/phrases_5M.txt |\
+		awk -F': ' '/^last-modified: / { print $$2 }')" \
+		+"%Y%m%d%H%M%S" > $@
+
+phraser-words500K-%.txt:
+	wget -O $@ https://lahosken.san-francisco.ca.us/frivolity/prog/phraser/words_500K.txt
+
+phraser-phrases5M-%.txt:
+	wget -O $@ https://lahosken.san-francisco.ca.us/frivolity/prog/phraser/phrases_5M.txt
+
+phraser-words500K-%-snap.index: phraser-words500K-%.txt nutrimatic/bin
 	# ~ 3m
-	wget -O - https://lahosken.san-francisco.ca.us/frivolity/prog/phraser/words_500K.txt |\
-		awk -F'\t' '{ for (i=0; i < $$1; i++) print $$2 }' |\
-		./merge.sh $@
+	awk -F'\t' '{ for (i=0; i < $$1; i++) print $$2 }' phraser-words500K-$*.txt | ./merge.sh $@
 
-phraser-phrases5M.index: nutrimatic/bin
+phraser-phrases5M-%-snap.index: phraser-phrases5M-%.txt nutrimatic/bin
 	# ~ 10m
-	wget -O - https://lahosken.san-francisco.ca.us/frivolity/prog/phraser/phrases_5M.txt |\
-		awk -F'\t' '{ for (i=0; i < $$1; i++) print $$2 }' |\
-		./merge.sh $@
+	awk -F'\t' '{ for (i=0; i < $$1; i++) print $$2 }' phraser-phrases5M-$*.txt | ./merge.sh $@
```