From 2784977e4613260a3e40a1e6de412ce276e50467 Mon Sep 17 00:00:00 2001
From: Kevin Wallace
Date: Thu, 9 May 2019 00:16:02 -0700
Subject: Initial commit

---
 .gitmodules   |   6 +++
 .htaccess     |   1 +
 Makefile      |  80 ++++++++++++++++++++++++++++++++
 index.php     | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 merge.sh      |  75 ++++++++++++++++++++++++++++++
 nutrimatic    |   1 +
 wikiextractor |   1 +
 7 files changed, 309 insertions(+)
 create mode 100644 .gitmodules
 create mode 100644 .htaccess
 create mode 100644 Makefile
 create mode 100644 index.php
 create mode 100755 merge.sh
 create mode 160000 nutrimatic
 create mode 160000 wikiextractor

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..eb4e7ca
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "nutrimatic"]
+	path = nutrimatic
+	url = https://github.com/egnor/nutrimatic
+[submodule "wikiextractor"]
+	path = wikiextractor
+	url = https://github.com/attardi/wikiextractor
diff --git a/.htaccess b/.htaccess
new file mode 100644
index 0000000..ed19066
--- /dev/null
+++ b/.htaccess
@@ -0,0 +1 @@
+RedirectMatch 404 /\.git
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3c95593
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,80 @@
+# Build Nutrimatic search indexes from the latest English Wikipedia and
+# Wiktionary dumps.  Plain `make` fetches the current dump versions and builds
+# any index that is missing.
+
+PATH := $(PATH):/usr/local/bin
+
+.PHONY: all
+all: enwiktionary-latest.version
+all: enwiktionary-latest.index
+all: enwiki-latest.version
+all: enwiki-latest.index
+
+# Discard the cached dump versions and rebuild, so upstream is re-queried.
+# The sub-make sits on its own recipe line: make runs any line that mentions
+# $(MAKE) even under -n, and the rm must not be dragged along with it.
+.PHONY: newest
+newest:
+	rm -f *.version
+	$(MAKE)
+
+# Discard every *.index file and rebuild them.
+.PHONY: reindex
+reindex:
+	rm -f *.index
+	$(MAKE)
+
+.PHONY: sync
+sync: newest
+	rsync -av *.index tilde:public_html/pzzl.org/nut/
+
+# Delete all but the two newest snapshot indexes of each wiki.  `xargs -r`
+# skips the rm when nothing is old enough to delete; without it GNU xargs
+# invokes rm with no operands and the target fails.
+.PHONY: trim
+trim:
+	ls enwiki-*-snap.index | sort | head -n -2 | xargs -r rm
+	ls enwiktionary-*-snap.index | sort | head -n -2 | xargs -r rm
+
+# Check out a submodule on demand.
+%/.git:
+	git submodule update --init $*
+
+nutrimatic/bin: nutrimatic/.git
+	cd nutrimatic && ./build.py
+
+wikiextractor/WikiExtractor.py: wikiextractor/.git
+
+# Record the last path component of the dump feed's channel link.  The result
+# goes through a temp file and must be non-empty: the pipeline's exit status
+# is sed's alone, so a failed download would otherwise leave an empty version
+# file that the -latest.index rule goes on to use.
+%-latest.version:
+	wget -O - https://dumps.wikimedia.org/$*/latest/$*-latest-pages-articles.xml.bz2-rss.xml |\
+		xpath '//rss/channel/link/text()' |\
+		sed 's/^.*\///' > $@.tmp
+	test -s $@.tmp || { rm -f $@.tmp; exit 1; }
+	mv $@.tmp $@
+
+enwiki-%-snap.merge: nutrimatic/bin wikiextractor/WikiExtractor.py
+	# ~7h30m
+	wget -O - https://dumps.wikimedia.org/enwiki/$*/enwiki-$*-pages-articles.xml.bz2 |\
+		bzip2 -d |\
+		python wikiextractor/WikiExtractor.py -q -o- - |\
+		./merge.sh $@
+
+enwiktionary-%-snap.merge: nutrimatic/bin wikiextractor/WikiExtractor.py
+	# ~1h
+	wget -O - https://dumps.wikimedia.org/enwiktionary/$*/enwiktionary-$*-pages-articles.xml.bz2 |\
+		bzip2 -d |\
+		python wikiextractor/WikiExtractor.py -q -o- - |\
+		./merge.sh $@
+
+%-snap.index: %-snap.merge nutrimatic/bin
+	nutrimatic/bin/merge-indexes 5 $*-snap.merge $@
+
+# Build the snapshot named by the version file and point the -latest symlink
+# at it.
+%-latest.index: %-latest.version
+	$(MAKE) $*-$(shell cat $*-latest.version)-snap.index
+	ln -sf $*-$(shell cat $*-latest.version)-snap.index $@
diff --git a/index.php b/index.php new file mode 100644 index 0000000..5ac793f --- /dev/null +++ b/index.php @@ -0,0 +1,145 @@ + ['pipe', 'r'], + 1 => ['pipe', 'w'], + 2 => ['pipe', 'w'], + ]; + $proc = proc_open($cmd, $descriptorspec, $pipes); + list($stdin, $stdout, $stderr) = $pipes; + fclose($stdin); + // assumption: anything written to stderr will fit in the pipe buffer + // nutrimatic only appears to write-then-exit, so It's Probably Fine + while ($line = fscanf($stdout, "%s %[^\n]s\n")) { + list($score, $text) = $line; + if ($match_fn($score, $text)) { + break; + } + } + fclose($stdout); + proc_terminate($proc); + while ($err = fgets($stderr)) { + $error_fn($err); + } + fclose($stderr); + $retval = proc_close($proc); + return $retval; +} + +function index_name($file) { + return preg_replace('/\.index$/', '', $file); +} + +$index_files = glob("*.index"); +$default_index = "enwiki-latest.index"; +$selected_index = $default_index; +foreach ($index_files as $file) { + if (index_name($file) == $_GET['idx']) { + $selected_index = $file; + } +} +$q = $_GET['q'] ?: ""; +$more = 1; +$autofocus = ' autofocus'; +if ($_GET['more']) { + $more = (int)($_GET['more']); + 
$autofocus = ''; +} +$max_more = 10; + +?> + + + + <?php + if ($q) { + echo htmlspecialchars($q); + echo ' » '; + } + if ($selected_index != $default_index) { + echo htmlspecialchars(index_name($selected_index)); + echo ' '; + } + echo "nut"; + ?> + + + +
+
+ nut + + /> + +
+
+ $max_more) { + echo "no"; + } else if ($q) { + ?> +
    + $more; + } + ?> +
  1. >
  2. + +
  3. + +
+ $more && $more != $max_more) { + ?> + class="more" href="? index_name($selected_index), + 'q' => $q, + 'more' => $depth, + ])?>#more">MORE + +

Hi I'm a Nutrimatic

+ + +
diff --git a/merge.sh b/merge.sh
new file mode 100755
index 0000000..6f764de
--- /dev/null
+++ b/merge.sh
@@ -0,0 +1,102 @@
+#!/bin/bash -e
+# Build one merged nutrimatic index from the text arriving on stdin.
+#
+# make-index (run in the foreground) reads stdin and is expected to emit
+# shard.NNNNN.index files.  A copy worker hands each finished shard to a merge
+# worker, which keeps folding the pending files together until a single file
+# is left; that file is moved to the path given as the only argument.
+if [ $# != 1 ]; then
+  echo >&2 "usage: $0 [outfile]"
+  exit 1
+fi
+
+OUT="$1"
+MERGE_INDEXES=$(realpath nutrimatic/bin/merge-indexes)
+MAKE_INDEX=$(realpath nutrimatic/bin/make-index)
+
+# merge-indexes FDs: stdin, stdout, stderr, [in...], out
+# fatal assumption: we'll never exceed ARG_MAX
+MAXMERGE=$(($(ulimit -n)-4))
+CUTOFF=1
+# Deliberately not named TMPDIR: if that name is exported in the environment,
+# assigning it would hand every child process this relative path as its
+# temp directory.
+WORKDIR=$(mktemp -d mergetmp.XXXXX)
+MERGEDIR=$WORKDIR/merge
+INDEXDIR=$WORKDIR/index
+mkdir -p "$MERGEDIR" "$INDEXDIR"
+
+# Merge worker: polls MERGEDIR and merges up to MAXMERGE files per pass.  Once
+# the EOF marker was present at the start of a pass and that pass took fewer
+# than MAXMERGE inputs, its output is the final result.
+(
+  cd "$MERGEDIR"
+  m=0
+  while true; do
+    eof=0
+    if [ -e EOF ]; then
+      eof=1
+    fi
+    # ls -S sorts largest first, so tail keeps the smallest files.  ls only
+    # runs when a pattern matched: with nullglob and no matches a bare `ls`
+    # would list every file in the directory instead.
+    files=($(shopt -s nullglob; set -- *.index *.merge; [ $# -eq 0 ] || ls -S "$@" | tail -n "$MAXMERGE"))
+    if [ $eof -eq 0 -a ${#files[@]} -lt 2 ]; then
+      sleep 1
+      continue
+    fi
+    out=shard.$(printf '%05d' $m).merge
+    m=$((m+1))
+    echo $out: ${files[@]}
+    # Separate commands so that -e stops this worker when a merge fails; the
+    # inputs are removed only after a successful merge.
+    "$MERGE_INDEXES" $CUTOFF "${files[@]}" "$out"
+    rm "${files[@]}"
+    if [ $eof -eq 1 -a ${#files[@]} -lt $MAXMERGE ]; then
+      mv "$out" ../merged
+      exit 0
+    fi
+  done
+) &
+merge_pid=$!
+
+# Copy worker: moves shard N into MERGEDIR once shard N+1 or the EOF marker
+# exists (presumably the sign that make-index has finished writing shard N),
+# then forwards the EOF marker after the last shard.
+(
+  n=0
+  while true; do
+    cur=$INDEXDIR/shard.$(printf '%05d' $n).index
+    n=$((n+1))
+    next=$INDEXDIR/shard.$(printf '%05d' $n).index
+    eof=$INDEXDIR/EOF
+    while [ ! -e "$next" -a ! -e "$eof" ]; do
+      sleep 1
+    done
+    echo "$cur"
+    mv "$cur" "$MERGEDIR/"
+    if [ ! -e "$next" -a -e "$eof" ]; then
+      mv "$eof" "$MERGEDIR/EOF"
+      exit 0
+    fi
+  done
+) &
+copy_pid=$!
+
+# INT/TERM: remove the work dir and signal the whole process group.
+trap "rm -rf '$WORKDIR'; kill 0; exit" INT TERM
+# Any other exit before the workers are reaped (e.g. -e tripping because
+# make-index or a worker failed): stop both workers, which may otherwise keep
+# polling, and remove the work dir.  The script's exit status is preserved.
+trap 'kill $merge_pid $copy_pid 2>/dev/null || true; rm -rf "$WORKDIR"' EXIT
+
+# consumes stdin
+"$MAKE_INDEX" "$INDEXDIR/shard"
+touch "$INDEXDIR/EOF"
+
+wait $copy_pid
+wait $merge_pid
+# Both workers are reaped; from here on only the work dir needs cleaning up.
+trap 'rm -rf "$WORKDIR"' EXIT
+mv "$WORKDIR/merged" "$OUT"
+rm -r "$WORKDIR"
+exit 0
diff --git a/nutrimatic b/nutrimatic
new file mode 160000
index 0000000..2159786
--- /dev/null
+++ b/nutrimatic
@@ -0,0 +1 @@
+Subproject commit 2159786044ac078cbb37cc5ba426837aa44d5909
diff --git a/wikiextractor b/wikiextractor
new file mode 160000
index 0000000..3162bb6
--- /dev/null
+++ b/wikiextractor
@@ -0,0 +1 @@
+Subproject commit 3162bb6c3c9ebd2d15be507aa11d6fa818a454ac
-- 
cgit v1.2.3