summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- .gitmodules 3
-rw-r--r-- Makefile 70
m--------- wikiextractor 0
3 files changed, 51 insertions, 22 deletions
diff --git a/.gitmodules b/.gitmodules
index eb4e7ca..8f00840 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,3 @@
[submodule "nutrimatic"]
path = nutrimatic
url = https://github.com/egnor/nutrimatic
-[submodule "wikiextractor"]
- path = wikiextractor
- url = https://github.com/attardi/wikiextractor
diff --git a/Makefile b/Makefile
index 9d52f79..5d399bf 100644
--- a/Makefile
+++ b/Makefile
@@ -5,6 +5,8 @@ all: enwiktionary-latest.version
all: enwiktionary-latest.index
all: enwiki-latest.version
all: enwiki-latest.index
+all: phraser-words500K-latest.index
+all: phraser-phrases5M-latest.index
.PHONY: newest
newest:
@@ -20,8 +22,13 @@ sync: newest
.PHONY: trim
trim:
- ls enwiki-*-snap.index | sort | head -n -2 | xargs -rt rm
- ls enwiktionary-*-snap.index | sort | head -n -2 | xargs -rt rm
+ ls enwiki-*-snap.index | sort | head -n -2 | xargs -t rm
+ ls enwiktionary-*-snap.index | sort | head -n -2 | xargs -t rm
+ ls enwiki-*-pages-articles.xml.bz2 | sort | head -n -2 | xargs -t rm
+ ls enwiktionary-*-pages-articles.xml.bz2 | sort | head -n -2 | xargs -t rm
+ ls phraser-words500K-*-snap.index | sort | head -n -2 | xargs -t rm
+ ls phraser-phrases5M-*-snap.index | sort | head -n -2 | xargs -t rm
+ ls phraser-*.txt | sort | head -n -2 | xargs -t rm
%/.git:
git submodule update --init $*
@@ -29,30 +36,55 @@ trim:
nutrimatic/bin: nutrimatic/.git
cd nutrimatic && ./build.py
-wikiextractor/WikiExtractor.py: wikiextractor/.git
-
-%-latest.version:
- wget -O - https://dumps.wikimedia.org/$*/latest/$*-latest-pages-articles.xml.bz2-rss.xml |\
- xpath '//rss/channel/link/text()' |\
+en%-latest.version:
+ wget -O - https://dumps.wikimedia.org/en$*/latest/en$*-latest-pages-articles.xml.bz2-rss.xml |\
+ xpath -e '//rss/channel/link/text()' |\
sed 's/^.*\///' > $@
-enwiki-%-snap.merge: nutrimatic/bin wikiextractor/WikiExtractor.py
- # ~7h30m
- wget -O - https://dumps.wikimedia.org/enwiki/$*/enwiki-$*-pages-articles.xml.bz2 |\
- bzip2 -d |\
- python wikiextractor/WikiExtractor.py -q -o- - |\
- ./merge.sh $@
+enwiki-%-pages-articles.xml.bz2:
+ wget -O $@ https://dumps.wikimedia.org/enwiki/$*/enwiki-$*-pages-articles.xml.bz2
-enwiktionary-%-snap.merge: nutrimatic/bin wikiextractor/WikiExtractor.py
+enwiki-%-snap.merge: nutrimatic/bin enwiki-%-pages-articles.xml.bz2
# ~1h
- wget -O - https://dumps.wikimedia.org/enwiktionary/$*/enwiktionary-$*-pages-articles.xml.bz2 |\
- bzip2 -d |\
- python wikiextractor/WikiExtractor.py -q -o- - |\
+ wikiextractor -q -o- enwiki-$*-pages-articles.xml.bz2 |\
+ ./merge.sh $@
+
+enwiktionary-%-pages-articles.xml.bz2:
+ wget -O $@ https://dumps.wikimedia.org/enwiktionary/$*/enwiktionary-$*-pages-articles.xml.bz2
+
+enwiktionary-%-snap.merge: nutrimatic/bin enwiktionary-%-pages-articles.xml.bz2
+ wikiextractor -q -o- enwiktionary-$*-pages-articles.xml.bz2 |\
./merge.sh $@
-%-snap.index: %-snap.merge nutrimatic/bin
- nutrimatic/bin/merge-indexes 5 $*-snap.merge $@
+en%-snap.index: en%-snap.merge nutrimatic/bin
+ nutrimatic/bin/merge-indexes 5 en$*-snap.merge $@
%-latest.index: %-latest.version
make $*-$(shell cat $*-latest.version)-snap.index
ln -sf $*-$(shell cat $*-latest.version)-snap.index $@
+
+phraser-words500K-latest.version:
+ date -d \
+ "$$(curl -I https://lahosken.san-francisco.ca.us/frivolity/prog/phraser/words_500K.txt |\
+ awk -F': ' '/^last-modified: / { print $$2 }')" \
+ +"%Y%m%d%H%M%S" > $@
+
+phraser-phrases5M-latest.version:
+ date -d \
+ "$$(curl -I https://lahosken.san-francisco.ca.us/frivolity/prog/phraser/phrases_5M.txt |\
+ awk -F': ' '/^last-modified: / { print $$2 }')" \
+ +"%Y%m%d%H%M%S" > $@
+
+phraser-words500K-%.txt:
+ wget -O $@ https://lahosken.san-francisco.ca.us/frivolity/prog/phraser/words_500K.txt
+
+phraser-phrases5M-%.txt:
+ wget -O $@ https://lahosken.san-francisco.ca.us/frivolity/prog/phraser/phrases_5M.txt
+
+phraser-words500K-%-snap.index: phraser-words500K-%.txt nutrimatic/bin
+ # ~ 3m
+ awk -F'\t' '{ for (i=0; i < $$1; i++) print $$2 }' phraser-words500K-$*.txt | ./merge.sh $@
+
+phraser-phrases5M-%-snap.index: phraser-phrases5M-%.txt nutrimatic/bin
+ # ~ 10m
+ awk -F'\t' '{ for (i=0; i < $$1; i++) print $$2 }' phraser-phrases5M-$*.txt | ./merge.sh $@
diff --git a/wikiextractor b/wikiextractor
deleted file mode 160000
-Subproject 3162bb6c3c9ebd2d15be507aa11d6fa818a454a