diff options
author | Kevin Wallace <kevin@wallace.seattle.wa.us> | 2025-01-25 18:28:52 -0800 |
---|---|---|
committer | Kevin Wallace <kevin@wallace.seattle.wa.us> | 2025-01-25 18:30:47 -0800 |
commit | 1f490b6ff954a433cecaa17bc74c9f2543edcebb (patch) | |
tree | e23de458b0b938987965c3658384b802f2caa8b7 /Makefile | |
parent | add phraser words500K and phrases5M lists (diff) |
use wikiextractor from local env, fetch intermediate .bz2, `xpath -e`
Diffstat (limited to 'Makefile')
-rw-r--r-- | Makefile | 31 |
1 file changed, 16 insertions, 15 deletions
```diff
@@ -22,8 +22,10 @@ sync: newest
 
 .PHONY: trim
 trim:
-	ls enwiki-*-snap.index | sort | head -n -2 | xargs -rt rm
-	ls enwiktionary-*-snap.index | sort | head -n -2 | xargs -rt rm
+	ls enwiki-*-snap.index | sort | head -n -2 | xargs -t rm
+	ls enwiktionary-*-snap.index | sort | head -n -2 | xargs -t rm
+	ls enwiki-*-pages-articles.xml.bz2 | sort | head -n -2 | xargs -t rm
+	ls enwiktionary-*-pages-articles.xml.bz2 | sort | head -n -2 | xargs -t rm
 
 %/.git:
 	git submodule update --init $*
@@ -31,25 +33,24 @@ trim:
 nutrimatic/bin: nutrimatic/.git
 	cd nutrimatic && ./build.py
 
-wikiextractor/WikiExtractor.py: wikiextractor/.git
-
 %-latest.version:
 	wget -O - https://dumps.wikimedia.org/$*/latest/$*-latest-pages-articles.xml.bz2-rss.xml |\
-	  xpath '//rss/channel/link/text()' |\
+	  xpath -e '//rss/channel/link/text()' |\
 	  sed 's/^.*\///' > $@
 
-enwiki-%-snap.merge: nutrimatic/bin wikiextractor/WikiExtractor.py
-	# ~7h30m
-	wget -O - https://dumps.wikimedia.org/enwiki/$*/enwiki-$*-pages-articles.xml.bz2 |\
-	  bzip2 -d |\
-	  python wikiextractor/WikiExtractor.py -q -o- - |\
-	  ./merge.sh $@
+enwiki-%-pages-articles.xml.bz2:
+	wget -O $@ https://dumps.wikimedia.org/enwiki/$*/enwiki-$*-pages-articles.xml.bz2
 
-enwiktionary-%-snap.merge: nutrimatic/bin wikiextractor/WikiExtractor.py
+enwiki-%-snap.merge: nutrimatic/bin enwiki-%-pages-articles.xml.bz2
 	# ~1h
-	wget -O - https://dumps.wikimedia.org/enwiktionary/$*/enwiktionary-$*-pages-articles.xml.bz2 |\
-	  bzip2 -d |\
-	  python wikiextractor/WikiExtractor.py -q -o- - |\
+	wikiextractor -q -o- enwiki-$*-pages-articles.xml.bz2 |\
+	  ./merge.sh $@
+
+enwiktionary-%-pages-articles.xml.bz2:
+	wget -O $@ https://dumps.wikimedia.org/enwiktionary/$*/enwiktionary-$*-pages-articles.xml.bz2
+
+enwiktionary-%-snap.merge: nutrimatic/bin enwiktionary-%-pages-articles.xml.bz2
+	wikiextractor -q -o- enwiktionary-$*-pages-articles.xml.bz2 |\
 	  ./merge.sh $@
 
 %-snap.index: %-snap.merge nutrimatic/bin
```