summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- .gitmodules 3
-rw-r--r-- Makefile 31
m--------- wikiextractor 0
3 files changed, 16 insertions, 18 deletions
diff --git a/.gitmodules b/.gitmodules
index eb4e7ca..8f00840 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,3 @@
[submodule "nutrimatic"]
path = nutrimatic
url = https://github.com/egnor/nutrimatic
-[submodule "wikiextractor"]
- path = wikiextractor
- url = https://github.com/attardi/wikiextractor
diff --git a/Makefile b/Makefile
index 398a518..95d78f5 100644
--- a/Makefile
+++ b/Makefile
@@ -22,8 +22,10 @@ sync: newest
.PHONY: trim
trim:
- ls enwiki-*-snap.index | sort | head -n -2 | xargs -rt rm
- ls enwiktionary-*-snap.index | sort | head -n -2 | xargs -rt rm
+ ls enwiki-*-snap.index | sort | head -n -2 | xargs -t rm
+ ls enwiktionary-*-snap.index | sort | head -n -2 | xargs -t rm
+ ls enwiki-*-pages-articles.xml.bz2 | sort | head -n -2 | xargs -t rm
+ ls enwiktionary-*-pages-articles.xml.bz2 | sort | head -n -2 | xargs -t rm
%/.git:
git submodule update --init $*
@@ -31,25 +33,24 @@ trim:
nutrimatic/bin: nutrimatic/.git
cd nutrimatic && ./build.py
-wikiextractor/WikiExtractor.py: wikiextractor/.git
-
%-latest.version:
wget -O - https://dumps.wikimedia.org/$*/latest/$*-latest-pages-articles.xml.bz2-rss.xml |\
- xpath '//rss/channel/link/text()' |\
+ xpath -e '//rss/channel/link/text()' |\
sed 's/^.*\///' > $@
-enwiki-%-snap.merge: nutrimatic/bin wikiextractor/WikiExtractor.py
- # ~7h30m
- wget -O - https://dumps.wikimedia.org/enwiki/$*/enwiki-$*-pages-articles.xml.bz2 |\
- bzip2 -d |\
- python wikiextractor/WikiExtractor.py -q -o- - |\
- ./merge.sh $@
+enwiki-%-pages-articles.xml.bz2:
+ wget -O $@ https://dumps.wikimedia.org/enwiki/$*/enwiki-$*-pages-articles.xml.bz2
-enwiktionary-%-snap.merge: nutrimatic/bin wikiextractor/WikiExtractor.py
+enwiki-%-snap.merge: nutrimatic/bin enwiki-%-pages-articles.xml.bz2
# ~1h
- wget -O - https://dumps.wikimedia.org/enwiktionary/$*/enwiktionary-$*-pages-articles.xml.bz2 |\
- bzip2 -d |\
- python wikiextractor/WikiExtractor.py -q -o- - |\
+ wikiextractor -q -o- enwiki-$*-pages-articles.xml.bz2 |\
+ ./merge.sh $@
+
+enwiktionary-%-pages-articles.xml.bz2:
+ wget -O $@ https://dumps.wikimedia.org/enwiktionary/$*/enwiktionary-$*-pages-articles.xml.bz2
+
+enwiktionary-%-snap.merge: nutrimatic/bin enwiktionary-%-pages-articles.xml.bz2
+ wikiextractor -q -o- enwiktionary-$*-pages-articles.xml.bz2 |\
./merge.sh $@
%-snap.index: %-snap.merge nutrimatic/bin
diff --git a/wikiextractor b/wikiextractor
deleted file mode 160000
-Subproject 3162bb6c3c9ebd2d15be507aa11d6fa818a454a