summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Kevin Wallace <kevin@wallace.seattle.wa.us> 2019-05-09 00:16:02 -0700
committer Kevin Wallace <kevin@wallace.seattle.wa.us> 2019-05-09 00:18:46 -0700
commit 2784977e4613260a3e40a1e6de412ce276e50467 (patch)
tree 9d5adc0b75996943803d8c594361de70fb8efe5b
Initial commit
-rw-r--r-- .gitmodules 6
-rw-r--r-- .htaccess 1
-rw-r--r-- Makefile 58
-rw-r--r-- index.php 145
-rwxr-xr-x merge.sh 75
m--------- nutrimatic 0
m--------- wikiextractor 0
7 files changed, 285 insertions, 0 deletions
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..eb4e7ca
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "nutrimatic"]
+ path = nutrimatic
+ url = https://github.com/egnor/nutrimatic
+[submodule "wikiextractor"]
+ path = wikiextractor
+ url = https://github.com/attardi/wikiextractor
diff --git a/.htaccess b/.htaccess
new file mode 100644
index 0000000..ed19066
--- /dev/null
+++ b/.htaccess
@@ -0,0 +1 @@
+RedirectMatch 404 /\.git
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3c95593
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,58 @@
+# Also look for tools in /usr/local/bin.
+PATH := $(PATH):/usr/local/bin
+
+# Default goal: the recorded dump version and the -latest index of both wikis.
+.PHONY: all
+all: enwiktionary-latest.version
+all: enwiktionary-latest.index
+all: enwiki-latest.version
+all: enwiki-latest.index
+
+# Forget the recorded dump versions and rebuild, so they are looked up again.
+# The sub-make is its own recipe line and goes through $(MAKE): command-line
+# flags carry over to it, and `make -n newest` only prints the rm.
+.PHONY: newest
+newest:
+ rm -f *.version
+ $(MAKE)
+
+# Remove every *.index file and rebuild.
+.PHONY: reindex
+reindex:
+ rm -f *.index
+ $(MAKE)
+
+# Refresh to the newest dumps, then rsync every index to the remote host.
+.PHONY: sync
+sync: newest
+ rsync -av *.index tilde:public_html/pzzl.org/nut/
+
+# Keep only the last two snapshot indexes of each wiki in sort order
+# (presumably the newest two, if the stem in the name is a dump date).
+# xargs -r: with two or fewer snapshots there is nothing to delete, and a
+# bare `rm` without operands would fail the recipe.
+.PHONY: trim
+trim:
+ ls enwiki-*-snap.index | sort | head -n -2 | xargs -r rm
+ ls enwiktionary-*-snap.index | sort | head -n -2 | xargs -r rm
+
+# Check out a git submodule on demand; $* is the directory matched by %.
+%/.git:
+ git submodule update --init $*
+
+# Build nutrimatic with its own build script once the submodule is checked out.
+nutrimatic/bin: nutrimatic/.git
+ cd nutrimatic && ./build.py
+
+# No recipe: the extractor script only depends on its submodule checkout.
+wikiextractor/WikiExtractor.py: wikiextractor/.git
+
+# Record which dump of wiki $* is the latest: fetch the dump's RSS feed, take
+# the channel link and keep only what follows its last slash.
+# NOTE(review): a failing wget looks masked by the pipeline (the status of sed
+# wins), which would leave an empty .version file behind -- confirm and guard.
+%-latest.version:
+ wget -O - https://dumps.wikimedia.org/$*/latest/$*-latest-pages-articles.xml.bz2-rss.xml |\
+ xpath '//rss/channel/link/text()' |\
+ sed 's/^.*\///' > $@
+
+# Stream the enwiki dump named by the stem through bzip2 and WikiExtractor
+# into merge.sh, which is given the target file name as its output.
+enwiki-%-snap.merge: nutrimatic/bin wikiextractor/WikiExtractor.py
+ # ~7h30m
+ wget -O - https://dumps.wikimedia.org/enwiki/$*/enwiki-$*-pages-articles.xml.bz2 |\
+ bzip2 -d |\
+ python wikiextractor/WikiExtractor.py -q -o- - |\
+ ./merge.sh $@
+
+# Same pipeline for the enwiktionary dump named by the stem.
+enwiktionary-%-snap.merge: nutrimatic/bin wikiextractor/WikiExtractor.py
+ # ~1h
+ wget -O - https://dumps.wikimedia.org/enwiktionary/$*/enwiktionary-$*-pages-articles.xml.bz2 |\
+ bzip2 -d |\
+ python wikiextractor/WikiExtractor.py -q -o- - |\
+ ./merge.sh $@
+
+# Run merge-indexes over a snapshot's .merge file to produce its .index;
+# 5 is the leading argument that merge.sh names CUTOFF (it uses 1 there).
+%-snap.index: %-snap.merge nutrimatic/bin
+ nutrimatic/bin/merge-indexes 5 $*-snap.merge $@
+
+# Build the snapshot named in $*-latest.version, then point the -latest
+# symlink at it. The sub-make goes through $(MAKE) so that command-line flags
+# and the jobserver carry over to it.
+%-latest.index: %-latest.version
+ $(MAKE) $*-$(shell cat $*-latest.version)-snap.index
+ ln -sf $*-$(shell cat $*-latest.version)-snap.index $@
diff --git a/index.php b/index.php
new file mode 100644
index 0000000..5ac793f
--- /dev/null
+++ b/index.php
@@ -0,0 +1,145 @@
+<?php
+
+// find_expr runs nutrimatic's find-expr on $index and streams its output.
+//
+//   $index     index file to search
+//   $expr      expression to search for
+//   $match_fn  called as $match_fn($score, $text) for each output line;
+//              if it returns a non-false value, the search will stop
+//   $error_fn  called as $error_fn($line) for each line find-expr wrote to
+//              stderr, and once if the process could not be started
+//
+// Returns the result of proc_close(), or -1 if the process never started.
+function find_expr($index, $expr, $match_fn, $error_fn) {
+    $cmd = implode(' ', array_map('escapeshellarg', [
+        "./nutrimatic/bin/find-expr",
+        $index,
+        $expr,
+    ]));
+    // limit the search to 30 seconds of CPU time; exec replaces the shell,
+    // so proc_terminate() below reaches find-expr itself
+    $cmd = "ulimit -t 30; exec $cmd";
+    $descriptorspec = [
+        0 => ['pipe', 'r'],
+        1 => ['pipe', 'w'],
+        2 => ['pipe', 'w'],
+    ];
+    $proc = proc_open($cmd, $descriptorspec, $pipes);
+    if ($proc === false) {
+        // nothing was started: there are no pipes to read and nothing to close
+        $error_fn("could not start find-expr\n");
+        return -1;
+    }
+    list($stdin, $stdout, $stderr) = $pipes;
+    fclose($stdin);
+    // assumption: anything written to stderr will fit in the pipe buffer
+    // nutrimatic only appears to write-then-exit, so It's Probably Fine
+    while ($line = fscanf($stdout, "%s %[^\n]s\n")) {
+        list($score, $text) = $line;
+        if ($match_fn($score, $text)) {
+            break;
+        }
+    }
+    fclose($stdout);
+    // the search may still be running if $match_fn asked to stop early
+    proc_terminate($proc);
+    while ($err = fgets($stderr)) {
+        $error_fn($err);
+    }
+    fclose($stderr);
+    $retval = proc_close($proc);
+    return $retval;
+}
+
+// index_name turns an index file name into its display name by removing a
+// trailing ".index".
+function index_name($file) {
+    $suffix = '/\.index$/';
+    $name = preg_replace($suffix, '', $file);
+    return $name;
+}
+
+// Indexes on offer: every *.index file next to this script.
+// glob() returns false on error; fall back to an empty list so the loops
+// over $index_files stay valid.
+$index_files = glob("*.index") ?: [];
+$default_index = "enwiki-latest.index";
+
+// ?idx= is only honoured when it names one of the files found above, so the
+// request cannot point find-expr at an arbitrary path. A missing or
+// non-string value simply leaves the default selected.
+$requested_index = (isset($_GET['idx']) && is_string($_GET['idx'])) ? $_GET['idx'] : null;
+$selected_index = $default_index;
+foreach ($index_files as $file) {
+    if (index_name($file) === $requested_index) {
+        $selected_index = $file;
+        break;
+    }
+}
+
+// Search expression; a missing, empty or non-string ?q= means "no search".
+$q = (isset($_GET['q']) && is_string($_GET['q']) && $_GET['q']) ? $_GET['q'] : "";
+
+// ?more=N raises the number of '#' lines of find-expr output the page reads
+// through before it stops. The search box is only autofocused on a first page.
+$more = 1;
+$autofocus = ' autofocus';
+if (!empty($_GET['more'])) {
+    $more = (int)($_GET['more']);
+    $autofocus = '';
+}
+// requests beyond this many are refused outright
+$max_more = 10;
+
+?>
+<html>
+<head>
+ <meta name="viewport" content="initial-scale=1.0">
+ <title><?php
+ // Title: the query followed by a separator (if any), then the index name
+ // (only when it is not the default), then "nut".
+ if ($q) {
+ echo htmlspecialchars($q);
+ echo ' &raquo; ';
+ }
+ if ($selected_index != $default_index) {
+ echo htmlspecialchars(index_name($selected_index));
+ echo ' ';
+ }
+ echo "nut";
+ ?></title>
+<style>
+
+</style>
+</head>
+<body>
+ <form>
+ <fieldset>
+ <legend><a href=".">nut</a></legend>
+ <select name="idx">
+ <?php
+ // One option per index file; the one in use for this request is pre-selected.
+ foreach ($index_files as $file) {
+ $name = index_name($file);
+ $attrs = '';
+ if ($file == $selected_index) {
+ $attrs .= ' selected';
+ }
+ ?>
+ <option<?= $attrs ?> value="<?=htmlspecialchars($name)?>"><?=htmlspecialchars($name)?></option>
+ <?php
+ }
+ ?>
+ </select>
+ <input type="text" name="q" value="<?=htmlspecialchars($q)?>" <?=$autofocus?>/>
+ <input type="submit" />
+ </fieldset>
+ </form>
+ <?php
+ // Refuse requests beyond $max_more; otherwise run the search if there is a
+ // query, or show the greeting if there is none.
+ if ($more > $max_more) {
+ echo "no";
+ } else if ($q) {
+ ?>
+ <ol>
+ <?php
+ // $depth counts the '#' lines seen so far. $anchor holds the id attribute
+ // for the next element printed, which is what the "#more" link jumps to.
+ $depth = 0;
+ $anchor = '';
+ find_expr($selected_index, $q, function($score, $text) use (&$depth, &$anchor, $more) {
+ if ($score == '#') {
+ // a '#' line is not printed as a match: count it, arm the anchor on the
+ // $more-th one, and stop the search once more than $more were seen
+ $depth++;
+ if ($depth == $more) {
+ $anchor .= ' id="more"';
+ }
+ return $depth > $more;
+ }
+ ?>
+ <li class="match" value="<?=(int)(float)($score)?>"<?=$anchor?>><?=htmlspecialchars($text)?></li>
+ <?php
+ // the anchor goes on one element only
+ $anchor = '';
+ }, function($err) {
+ ?>
+ <li class="error" value="0"><?=htmlspecialchars($err)?></li>
+ <?php
+ });
+ ?>
+ </ol>
+ <?php
+ // The search was stopped by the limit above and the cap is not reached yet:
+ // link to the same query with the limit raised to $depth. If no match was
+ // printed after the anchor was armed, the link itself carries the id.
+ if ($depth > $more && $more != $max_more) {
+ ?>
+ <a<?=$anchor?> class="more" href="?<?=http_build_query([
+ 'idx' => index_name($selected_index),
+ 'q' => $q,
+ 'more' => $depth,
+ ])?>#more">MORE</a>
+ <?php
+ }
+ } else {
+ ?>
+ <p>Hi I'm a <a href="//nutrimatic.org/">Nutrimatic</a></p>
+ <?php
+ }
+ ?>
+</body>
+</html>
diff --git a/merge.sh b/merge.sh
new file mode 100755
index 0000000..6f764de
--- /dev/null
+++ b/merge.sh
@@ -0,0 +1,75 @@
+#!/bin/bash -e
+# Index the text arriving on stdin with nutrimatic's make-index and merge the
+# resulting shards into the single file named by the only argument.
+if [ $# != 1 ]; then
+    # the argument is mandatory, hence <outfile> rather than [outfile]
+    echo >&2 "usage: $0 <outfile>"
+    exit 1
+fi
+
+OUT="$1"
+# absolute paths, because the merge loop runs after cd'ing into its work directory
+MERGE_INDEXES=$(realpath nutrimatic/bin/merge-indexes)
+MAKE_INDEX=$(realpath nutrimatic/bin/make-index)
+
+# merge-indexes FDs: stdin, stdout, stderr, [in...], out
+# fatal assumption: we'll never exceed ARG_MAX
+# `ulimit -n` may print "unlimited"; the arithmetic below would read that as 0
+# and make MAXMERGE negative, so the merge loop's "fewer than MAXMERGE files
+# left" exit test could never succeed. Use a fixed ceiling in that case.
+NOFILE=$(ulimit -n)
+if [ "$NOFILE" = unlimited ]; then
+    NOFILE=1024
+fi
+MAXMERGE=$((NOFILE-4))
+CUTOFF=1
+# scratch directory, created relative to the current directory:
+#   index/  where make-index is told to write its shards
+#   merge/  where shards are moved to and merged
+TMPDIR=$(mktemp -d mergetmp.XXXXX)
+MERGEDIR=$TMPDIR/merge
+INDEXDIR=$TMPDIR/index
+mkdir -p "$MERGEDIR" "$INDEXDIR"
+
+# Background merger. Working inside $MERGEDIR, it repeatedly merges up to
+# $MAXMERGE of the smallest *.index/*.merge files (ls -S lists largest first,
+# tail keeps the end of the list) into a new shard.NNNNN.merge and deletes the
+# inputs. Once the EOF marker is present and a round used fewer than $MAXMERGE
+# files, that round's output is the final result and is moved to ../merged.
+(
+ cd $MERGEDIR
+ m=0
+ while true; do
+ # Sample EOF *before* listing: the copy loop moves EOF in only after the last
+ # shard, so if EOF is seen here the listing below includes every shard.
+ eof=0
+ if [ -e EOF ]; then
+ eof=1
+ fi
+ # NOTE(review): with nullglob, if neither pattern matches, ls gets no
+ # operands and lists the directory itself (which may hold EOF) -- confirm
+ # this cannot happen while EOF is present.
+ files=($(shopt -s nullglob; ls -S *.index *.merge | tail -n $MAXMERGE))
+ # still receiving shards and fewer than two files: nothing worth merging yet
+ if [ $eof -eq 0 -a ${#files[@]} -lt 2 ]; then
+ sleep 1
+ continue
+ fi
+ out=shard.$(printf '%05d' $m).merge
+ m=$((m+1))
+ echo $out: ${files[@]}
+ # inputs are removed only if the merge succeeded
+ # NOTE(review): as part of an && list a failing merge does not trip -e; the
+ # loop just carries on -- confirm that is intended.
+ $MERGE_INDEXES $CUTOFF ${files[@]} $out && rm ${files[@]}
+ if [ $eof -eq 1 -a ${#files[@]} -lt $MAXMERGE ]; then
+ mv $out ../merged
+ exit 0
+ fi
+ done
+) &
+merge_pid=$!
+
+# Background hand-off. Moves shard N from $INDEXDIR to $MERGEDIR once either
+# shard N+1 or the EOF marker exists -- presumably because that means the
+# writer is done with shard N (TODO confirm make-index writes shards strictly
+# in order). After the last shard it moves the EOF marker across and exits.
+(
+ n=0
+ while true; do
+ cur=$INDEXDIR/shard.$(printf '%05d' $n).index
+ n=$((n+1))
+ next=$INDEXDIR/shard.$(printf '%05d' $n).index
+ eof=$INDEXDIR/EOF
+ # wait until there is a following shard or the input has ended
+ while [ ! -e "$next" -a ! -e "$eof" ]; do
+ sleep 1
+ done
+ echo "$cur"
+ mv "$cur" "$MERGEDIR/"
+ # no following shard and input ended: $cur was the last one
+ if [ ! -e "$next" -a -e "$eof" ]; then
+ mv "$eof" "$MERGEDIR/EOF"
+ exit 0
+ fi
+ done
+) &
+copy_pid=$!
+
+# On INT/TERM: remove the scratch directory and signal the whole process group.
+trap "rm -rf '$TMPDIR'; kill 0; exit" INT TERM
+
+# abort MESSAGE: stop both background loops, remove the scratch directory and
+# exit non-zero. Without this, a failure below would make `bash -e` exit at
+# once, leaving the loops polling for an EOF marker or result that never
+# comes and the scratch directory behind.
+abort() {
+    echo >&2 "$0: $1"
+    # either loop may already have exited; that is not an error here
+    kill $copy_pid $merge_pid 2>/dev/null || true
+    rm -rf "$TMPDIR"
+    exit 1
+}
+
+# consumes stdin
+$MAKE_INDEX $INDEXDIR/shard || abort "make-index failed"
+# tell the copy loop that no further shards will appear
+touch $INDEXDIR/EOF
+
+wait $copy_pid || abort "moving shards to the merge directory failed"
+wait $merge_pid || abort "merging shards failed"
+mv $TMPDIR/merged "$OUT"
+rm -r $TMPDIR
+exit 0
diff --git a/nutrimatic b/nutrimatic
new file mode 160000
+Subproject 2159786044ac078cbb37cc5ba426837aa44d590
diff --git a/wikiextractor b/wikiextractor
new file mode 160000
+Subproject 3162bb6c3c9ebd2d15be507aa11d6fa818a454a