diff options
author | Kevin Wallace <kevin@wallace.seattle.wa.us> | 2019-05-09 00:16:02 -0700 |
---|---|---|
committer | Kevin Wallace <kevin@wallace.seattle.wa.us> | 2019-05-09 00:18:46 -0700 |
commit | 2784977e4613260a3e40a1e6de412ce276e50467 (patch) | |
tree | 9d5adc0b75996943803d8c594361de70fb8efe5b |
Initial commit
-rw-r--r-- | .gitmodules | 6 | ||||
-rw-r--r-- | .htaccess | 1 | ||||
-rw-r--r-- | Makefile | 75 | ||||
-rw-r--r-- | index.php | 174 | ||||
-rwxr-xr-x | merge.sh | 105 | ||||
m--------- | nutrimatic | 0 | ||||
m--------- | wikiextractor | 0 |
7 files changed, 361 insertions, 0 deletions
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..eb4e7ca
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "nutrimatic"]
+	path = nutrimatic
+	url = https://github.com/egnor/nutrimatic
+[submodule "wikiextractor"]
+	path = wikiextractor
+	url = https://github.com/attardi/wikiextractor
diff --git a/.htaccess b/.htaccess
new file mode 100644
index 0000000..ed19066
--- /dev/null
+++ b/.htaccess
@@ -0,0 +1 @@
+RedirectMatch 404 /\.git
diff --git a/Makefile b/Makefile
new file mode 100644
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,75 @@
+# Builds nutrimatic indexes from the latest English Wikipedia and Wiktionary
+# dumps.  Running "make" looks up the latest dump versions and builds them.
+PATH := $(PATH):/usr/local/bin
+
+.PHONY: all
+all: enwiktionary-latest.version
+all: enwiktionary-latest.index
+all: enwiki-latest.version
+all: enwiki-latest.index
+
+# Forget the recorded dump versions so that they are looked up again.
+# $(MAKE) rather than a literal "make", so that command-line flags and the
+# jobserver are passed on to the sub-make.
+.PHONY: newest
+newest:
+	rm -f *.version && $(MAKE)
+
+# Delete every *.index file and build them again.
+.PHONY: reindex
+reindex:
+	rm -f *.index && $(MAKE)
+
+# Refresh to the newest dumps, then upload the indexes.
+.PHONY: sync
+sync: newest
+	rsync -av *.index tilde:public_html/pzzl.org/nut/
+
+# Delete all but the last two snapshots (in name order) of each index.
+# xargs -r runs nothing when the list is empty, instead of a failing bare rm.
+.PHONY: trim
+trim:
+	ls enwiki-*-snap.index | sort | head -n -2 | xargs -r rm
+	ls enwiktionary-*-snap.index | sort | head -n -2 | xargs -r rm
+
+# Check out a submodule on demand.
+%/.git:
+	git submodule update --init $*
+
+nutrimatic/bin: nutrimatic/.git
+	cd nutrimatic && ./build.py
+
+wikiextractor/WikiExtractor.py: wikiextractor/.git
+
+# Record the last path component of the link in the dump's RSS feed.
+# The pipeline's exit status is sed's alone, so a failed download would go
+# unnoticed: write to a temporary name and only install a non-empty result.
+%-latest.version:
+	wget -O - https://dumps.wikimedia.org/$*/latest/$*-latest-pages-articles.xml.bz2-rss.xml |\
+	 xpath '//rss/channel/link/text()' |\
+	 sed 's/^.*\///' > $@.tmp
+	test -s $@.tmp
+	mv $@.tmp $@
+
+# Download, decompress and extract a dump, streaming the text into merge.sh.
+enwiki-%-snap.merge: nutrimatic/bin wikiextractor/WikiExtractor.py
+	# ~7h30m
+	wget -O - https://dumps.wikimedia.org/enwiki/$*/enwiki-$*-pages-articles.xml.bz2 |\
+	 bzip2 -d |\
+	 python wikiextractor/WikiExtractor.py -q -o- - |\
+	 ./merge.sh $@
+
+enwiktionary-%-snap.merge: nutrimatic/bin wikiextractor/WikiExtractor.py
+	# ~1h
+	wget -O - https://dumps.wikimedia.org/enwiktionary/$*/enwiktionary-$*-pages-articles.xml.bz2 |\
+	 bzip2 -d |\
+	 python wikiextractor/WikiExtractor.py -q -o- - |\
+	 ./merge.sh $@
+
+%-snap.index: %-snap.merge nutrimatic/bin
+	nutrimatic/bin/merge-indexes 5 $*-snap.merge $@
+
+# Build the snapshot named in the .version file and point -latest at it.
+%-latest.index: %-latest.version
+	$(MAKE) $*-$(shell cat $*-latest.version)-snap.index
+	ln -sf $*-$(shell cat $*-latest.version)-snap.index $@
diff --git a/index.php b/index.php
new file mode 100644
--- /dev/null
+++ b/index.php
@@ -0,0 +1,174 @@
+<?php
+
+// find_expr runs nutrimatic's find-expr on $index with the expression $expr,
+// calling $match_fn($score, $text) for each line it prints.
+// if $match_fn returns a non-false value, the search will stop.
+// $error_fn($line) is called for each line find-expr wrote to stderr, or
+// once if the process could not be started.
+// returns find-expr's exit status as reported by proc_close(), or -1 if the
+// process could not be started.
+function find_expr($index, $expr, $match_fn, $error_fn) {
+  $cmd = implode(' ', array_map('escapeshellarg', [
+    "./nutrimatic/bin/find-expr",
+    $index,
+    $expr,
+  ]));
+  // cap CPU time at 30 seconds; exec replaces the shell with find-expr
+  $cmd = "ulimit -t 30; exec $cmd";
+  $descriptorspec = [
+    0 => ['pipe', 'r'],
+    1 => ['pipe', 'w'],
+    2 => ['pipe', 'w'],
+  ];
+  $proc = proc_open($cmd, $descriptorspec, $pipes);
+  if (!is_resource($proc)) {
+    // without this check the pipe handling below would run on an empty $pipes
+    $error_fn("could not start find-expr\n");
+    return -1;
+  }
+  list($stdin, $stdout, $stderr) = $pipes;
+  fclose($stdin);
+  // assumption: anything written to stderr will fit in the pipe buffer
+  // nutrimatic only appears to write-then-exit, so It's Probably Fine
+  while ($line = fscanf($stdout, "%s %[^\n]s\n")) {
+    list($score, $text) = $line;
+    if ($match_fn($score, $text)) {
+      break;
+    }
+  }
+  fclose($stdout);
+  proc_terminate($proc);
+  while ($err = fgets($stderr)) {
+    $error_fn($err);
+  }
+  fclose($stderr);
+  $retval = proc_close($proc);
+  return $retval;
+}
+
+// index_name strips the ".index" suffix from an index file name.
+function index_name($file) {
+  return preg_replace('/\.index$/', '', $file);
+}
+
+// get_param returns the query parameter $name, or $default when it is
+// missing or is not a plain string (e.g. "?q[]=x" arrives as an array).
+function get_param($name, $default = '') {
+  if (isset($_GET[$name]) && is_string($_GET[$name])) {
+    return $_GET[$name];
+  }
+  return $default;
+}
+
+// glob() returns false on error; treat that as "no indexes"
+$index_files = glob("*.index") ?: [];
+$default_index = "enwiki-latest.index";
+$selected_index = $default_index;
+$idx = get_param('idx');
+foreach ($index_files as $file) {
+  if (index_name($file) === $idx) {
+    $selected_index = $file;
+  }
+}
+$q = get_param('q');
+// compare against '' rather than testing truthiness, so that "0" is a query
+$has_query = ($q !== '');
+$more = 1;
+$autofocus = ' autofocus';
+if (get_param('more')) {
+  // never below 1, the value used when no "more" is given
+  $more = max(1, (int)get_param('more'));
+  $autofocus = '';
+}
+$max_more = 10;
+
+?>
+<html>
+<head>
+  <meta name="viewport" content="initial-scale=1.0">
+  <title><?php
+    if ($has_query) {
+      echo htmlspecialchars($q);
+      echo ' » ';
+    }
+    if ($selected_index != $default_index) {
+      echo htmlspecialchars(index_name($selected_index));
+      echo ' ';
+    }
+    echo "nut";
+  ?></title>
+<style>
+
+</style>
+</head>
+<body>
+  <form>
+    <fieldset>
+      <legend><a href=".">nut</a></legend>
+      <select name="idx">
+        <?php
+          foreach ($index_files as $file) {
+            $name = index_name($file);
+            $attrs = '';
+            if ($file == $selected_index) {
+              $attrs .= ' selected';
+            }
+            ?>
+            <option<?= $attrs ?> value="<?=htmlspecialchars($name)?>"><?=htmlspecialchars($name)?></option>
+            <?php
+          }
+        ?>
+      </select>
+      <input type="text" name="q" value="<?=htmlspecialchars($q)?>" <?=$autofocus?>/>
+      <input type="submit" />
+    </fieldset>
+  </form>
+  <?php
+    if ($more > $max_more) {
+      echo "no";
+    } else if ($has_query) {
+      ?>
+      <ol>
+        <?php
+          // $depth counts the '#' lines seen so far; the search stops once
+          // more than $more of them have gone by
+          $depth = 0;
+          $anchor = '';
+          find_expr($selected_index, $q, function($score, $text) use (&$depth, &$anchor, $more) {
+            if ($score == '#') {
+              $depth++;
+              if ($depth == $more) {
+                // tag whatever is printed next as the target of "#more"
+                $anchor .= ' id="more"';
+              }
+              return $depth > $more;
+            }
+            ?>
+            <li class="match" value="<?=(int)(float)($score)?>"<?=$anchor?>><?=htmlspecialchars($text)?></li>
+            <?php
+            $anchor = '';
+          }, function($err) {
+            ?>
+            <li class="error" value="0"><?=htmlspecialchars($err)?></li>
+            <?php
+          });
+        ?>
+      </ol>
+      <?php
+      if ($depth > $more && $more != $max_more) {
+        ?>
+        <a<?=$anchor?> class="more" href="?<?=http_build_query([
+          'idx' => index_name($selected_index),
+          'q' => $q,
+          'more' => $depth,
+        ])?>#more">MORE</a>
+        <?php
+      }
+    } else {
+      ?>
+      <p>Hi I'm a <a href="//nutrimatic.org/">Nutrimatic</a></p>
+      <?php
+    }
+  ?>
+</body>
+</html>
diff --git a/merge.sh b/merge.sh
new file mode 100755
--- /dev/null
+++ b/merge.sh
@@ -0,0 +1,105 @@
+#!/bin/bash -e
+# Builds one merged nutrimatic index at <outfile> from the text on stdin.
+#
+# make-index consumes stdin and writes numbered shards under $INDEXDIR.  One
+# background loop moves shards into $MERGEDIR; a second one keeps merging
+# whatever has accumulated there until a single merged file is left.
+if [ $# != 1 ]; then
+  # the argument is required, hence <outfile> rather than [outfile]
+  echo >&2 "usage: $0 <outfile>"
+  exit 1
+fi
+
+OUT="$1"
+MERGE_INDEXES=$(realpath nutrimatic/bin/merge-indexes)
+MAKE_INDEX=$(realpath nutrimatic/bin/make-index)
+
+# merge-indexes FDs: stdin, stdout, stderr, [in...], out
+# fatal assumption: we'll never exceed MAX_ARG
+MAXMERGE=$(($(ulimit -n)-4))
+CUTOFF=1
+# Deliberately not called TMPDIR: when that variable is exported in the
+# caller's environment, assigning to it would pass this relative path on to
+# every child process as its temporary directory.
+WORKDIR=$(mktemp -d mergetmp.XXXXX)
+MERGEDIR=$WORKDIR/merge
+INDEXDIR=$WORKDIR/index
+mkdir -p "$MERGEDIR" "$INDEXDIR"
+
+# merge loop
+(
+  cd "$MERGEDIR"
+  m=0
+  while true; do
+    # look for EOF before listing, so that everything that was moved in
+    # ahead of the marker is in the listing
+    eof=0
+    if [ -e EOF ]; then
+      eof=1
+    fi
+    # Take the $MAXMERGE smallest files (ls -S lists largest first).  The
+    # globs go into an array first: with nothing to match, a bare "ls -S"
+    # would list the directory itself and could return the EOF marker.
+    shopt -s nullglob
+    candidates=(*.index *.merge)
+    shopt -u nullglob
+    files=()
+    if [ ${#candidates[@]} -gt 0 ]; then
+      files=($(ls -S "${candidates[@]}" | tail -n "$MAXMERGE"))
+    fi
+    if [ "$eof" -eq 0 ] && [ ${#files[@]} -lt 2 ]; then
+      sleep 1
+      continue
+    fi
+    out=shard.$(printf '%05d' $m).merge
+    m=$((m+1))
+    echo "$out: ${files[*]}"
+    # Give up on a failed merge: carrying on would keep the inputs around
+    # and feed the partial $out back in as an input on the next pass.
+    if ! "$MERGE_INDEXES" "$CUTOFF" "${files[@]}" "$out"; then
+      echo >&2 "$0: merge-indexes failed while writing $out"
+      exit 1
+    fi
+    rm "${files[@]}"
+    # after EOF, fewer than $MAXMERGE inputs means nothing was left out of
+    # this merge, so its output is the final result
+    if [ "$eof" -eq 1 ] && [ ${#files[@]} -lt "$MAXMERGE" ]; then
+      mv "$out" ../merged
+      exit 0
+    fi
+  done
+) &
+merge_pid=$!
+
+# copy loop: shard N is moved only once shard N+1 or the EOF marker exists
+# (presumably so that a shard still being written is never merged)
+(
+  n=0
+  while true; do
+    cur=$INDEXDIR/shard.$(printf '%05d' $n).index
+    n=$((n+1))
+    next=$INDEXDIR/shard.$(printf '%05d' $n).index
+    eof=$INDEXDIR/EOF
+    while [ ! -e "$next" ] && [ ! -e "$eof" ]; do
+      sleep 1
+    done
+    echo "$cur"
+    mv "$cur" "$MERGEDIR/"
+    if [ ! -e "$next" ] && [ -e "$eof" ]; then
+      mv "$eof" "$MERGEDIR/EOF"
+      exit 0
+    fi
+  done
+) &
+copy_pid=$!
+
+trap "rm -rf '$WORKDIR'; kill 0; exit" INT TERM
+
+# consumes stdin
+"$MAKE_INDEX" "$INDEXDIR/shard"
+touch "$INDEXDIR/EOF"
+
+wait $copy_pid
+wait $merge_pid
+mv "$WORKDIR/merged" "$OUT"
+rm -r "$WORKDIR"
+exit 0
diff --git a/nutrimatic b/nutrimatic
new file mode 160000
+Subproject 2159786044ac078cbb37cc5ba426837aa44d590
diff --git a/wikiextractor b/wikiextractor
new file mode 160000
+Subproject 3162bb6c3c9ebd2d15be507aa11d6fa818a454a |