diff options
| author | Kevin Wallace <kevin@wallace.seattle.wa.us> | 2025-01-25 18:28:52 -0800 | 
|---|---|---|
| committer | Kevin Wallace <kevin@wallace.seattle.wa.us> | 2025-01-25 18:30:47 -0800 | 
| commit | 1f490b6ff954a433cecaa17bc74c9f2543edcebb (patch) | |
| tree | e23de458b0b938987965c3658384b802f2caa8b7 | |
| parent | add phraser words500K and phrases5M lists (diff) | |
use wikiextractor from local env, fetch intermediate .bz2, `xpath -e`
| -rw-r--r-- | .gitmodules | 3 | ||||
| -rw-r--r-- | Makefile | 31 | ||||
| m--------- | wikiextractor | 0 | 
3 files changed, 16 insertions, 18 deletions
| diff --git a/.gitmodules b/.gitmodules index eb4e7ca..8f00840 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@  [submodule "nutrimatic"]  	path = nutrimatic  	url = https://github.com/egnor/nutrimatic -[submodule "wikiextractor"] -	path = wikiextractor -	url = https://github.com/attardi/wikiextractor @@ -22,8 +22,10 @@ sync: newest  .PHONY: trim  trim: -	ls enwiki-*-snap.index | sort | head -n -2 | xargs -rt rm -	ls enwiktionary-*-snap.index | sort | head -n -2 | xargs -rt rm +	ls enwiki-*-snap.index | sort | head -n -2 | xargs -t rm +	ls enwiktionary-*-snap.index | sort | head -n -2 | xargs -t rm +	ls enwiki-*-pages-articles.xml.bz2 | sort | head -n -2 | xargs -t rm +	ls enwiktionary-*-pages-articles.xml.bz2 | sort | head -n -2 | xargs -t rm  %/.git:  	git submodule update --init $* @@ -31,25 +33,24 @@ trim:  nutrimatic/bin: nutrimatic/.git  	cd nutrimatic && ./build.py -wikiextractor/WikiExtractor.py: wikiextractor/.git -  %-latest.version:  	wget -O - https://dumps.wikimedia.org/$*/latest/$*-latest-pages-articles.xml.bz2-rss.xml |\ -		xpath '//rss/channel/link/text()' |\ +		xpath -e '//rss/channel/link/text()' |\  		sed 's/^.*\///' > $@ -enwiki-%-snap.merge: nutrimatic/bin wikiextractor/WikiExtractor.py -	# ~7h30m -	wget -O - https://dumps.wikimedia.org/enwiki/$*/enwiki-$*-pages-articles.xml.bz2 |\ -		bzip2 -d |\ -		python wikiextractor/WikiExtractor.py -q -o- - |\ -		./merge.sh $@ +enwiki-%-pages-articles.xml.bz2: +	wget -O $@ https://dumps.wikimedia.org/enwiki/$*/enwiki-$*-pages-articles.xml.bz2 -enwiktionary-%-snap.merge: nutrimatic/bin wikiextractor/WikiExtractor.py +enwiki-%-snap.merge: nutrimatic/bin enwiki-%-pages-articles.xml.bz2  	# ~1h -	wget -O - https://dumps.wikimedia.org/enwiktionary/$*/enwiktionary-$*-pages-articles.xml.bz2 |\ -		bzip2 -d |\ -		python wikiextractor/WikiExtractor.py -q -o- - |\ +	wikiextractor -q -o- enwiki-$*-pages-articles.xml.bz2 |\ +		./merge.sh $@ + +enwiktionary-%-pages-articles.xml.bz2: +	wget -O $@ https://dumps.wikimedia.org/enwiktionary/$*/enwiktionary-$*-pages-articles.xml.bz2 + +enwiktionary-%-snap.merge: nutrimatic/bin enwiktionary-%-pages-articles.xml.bz2 +	wikiextractor -q -o- enwiktionary-$*-pages-articles.xml.bz2 |\  		./merge.sh $@  %-snap.index: %-snap.merge nutrimatic/bin diff --git a/wikiextractor b/wikiextractor deleted file mode 160000 -Subproject 3162bb6c3c9ebd2d15be507aa11d6fa818a454a |