Initial commit

author: Kevin Wallace <kevin@wallace.seattle.wa.us> 2019-05-09 00:16:02 -0700
committer: Kevin Wallace <kevin@wallace.seattle.wa.us> 2019-05-09 00:18:46 -0700
commit: 2784977e4613260a3e40a1e6de412ce276e50467 (patch)
tree: 9d5adc0b75996943803d8c594361de70fb8efe5b
7 files changed, 285 insertions, 0 deletions
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..eb4e7ca
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "nutrimatic"]
+	path = nutrimatic
+	url = https://github.com/egnor/nutrimatic
+[submodule "wikiextractor"]
+	path = wikiextractor
+	url = https://github.com/attardi/wikiextractor
diff --git a/.htaccess b/.htaccess
new file mode 100644
index 0000000..ed19066
--- /dev/null
+++ b/.htaccess
@@ -0,0 +1 @@
+# Answer 404 for any request whose URL path contains "/.git". The pattern is
+# unanchored, so it also covers paths such as /.gitmodules.
+RedirectMatch 404 /\.git
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3c95593
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,58 @@
+# Tools used by the recipes may live in /usr/local/bin.
+PATH := $(PATH):/usr/local/bin
+
+# Default goal: resolve the latest dump version of each wiki and make sure an
+# index for that version exists.
+.PHONY: all
+all: enwiktionary-latest.version
+all: enwiktionary-latest.index
+all: enwiki-latest.version
+all: enwiki-latest.index
+
+# Forget the recorded versions and rebuild, which re-queries the latest dumps.
+# $(MAKE) (rather than a literal "make") runs the same make binary and passes
+# along flags such as -n, -k and the jobserver.
+.PHONY: newest
+newest:
+	rm -f *.version && $(MAKE)
+
+# Delete every index (snapshots and -latest symlinks) and rebuild.
+.PHONY: reindex
+reindex:
+	rm -f *.index && $(MAKE)
+
+# Refresh to the newest dumps, then upload all indexes.
+.PHONY: sync
+sync: newest
+	rsync -av *.index tilde:public_html/pzzl.org/nut/
+
+# Keep only the two newest snapshot indexes per wiki (names sort by version).
+# rm -f makes "nothing to delete" a success instead of failing the target
+# when xargs runs rm without operands.
+.PHONY: trim
+trim:
+	ls enwiki-*-snap.index | sort | head -n -2 | xargs rm -f
+	ls enwiktionary-*-snap.index | sort | head -n -2 | xargs rm -f
+
+# Check out a git submodule on demand; $* is the submodule's directory.
+%/.git:
+	git submodule update --init $*
+
+# Build the nutrimatic tools once the submodule has been checked out.
+nutrimatic/bin: nutrimatic/.git
+	cd nutrimatic && ./build.py
+
+# No recipe: the script is expected to appear when the submodule is checked out.
+wikiextractor/WikiExtractor.py: wikiextractor/.git
+
+# Record the newest dump version of wiki $*: the last path component of the
+# channel link in the dump's RSS feed.
+# The pipeline's exit status is sed's alone, so a failed download would
+# otherwise leave an empty, seemingly up-to-date version file. Write to a
+# temporary file and only install it when it is non-empty.
+%-latest.version:
+	wget -O - https://dumps.wikimedia.org/$*/latest/$*-latest-pages-articles.xml.bz2-rss.xml |\
+		xpath '//rss/channel/link/text()' |\
+		sed 's/^.*\///' > $@.tmp
+	test -s $@.tmp
+	mv $@.tmp $@
+
+# Stream one enwiki dump ($* is the dump version) through decompression and
+# text extraction into merge.sh, which writes the merged result to $@.
+enwiki-%-snap.merge: nutrimatic/bin wikiextractor/WikiExtractor.py
+	# ~7h30m
+	wget -O - https://dumps.wikimedia.org/enwiki/$*/enwiki-$*-pages-articles.xml.bz2 |\
+		bzip2 -d |\
+		python wikiextractor/WikiExtractor.py -q -o- - |\
+		./merge.sh $@
+
+# Same pipeline for enwiktionary.
+enwiktionary-%-snap.merge: nutrimatic/bin wikiextractor/WikiExtractor.py
+	# ~1h
+	wget -O - https://dumps.wikimedia.org/enwiktionary/$*/enwiktionary-$*-pages-articles.xml.bz2 |\
+		bzip2 -d |\
+		python wikiextractor/WikiExtractor.py -q -o- - |\
+		./merge.sh $@
+
+# Produce the final index from a merged snapshot; 5 is the first argument
+# passed to merge-indexes (merge.sh passes 1 in the same position).
+%-snap.index: %-snap.merge nutrimatic/bin
+	nutrimatic/bin/merge-indexes 5 $*-snap.merge $@
+
+# Build the snapshot index for the recorded latest version and point the
+# -latest symlink at it. $(MAKE) runs the same make binary and passes along
+# flags such as -n, -k and the jobserver to the sub-make.
+%-latest.index: %-latest.version
+	$(MAKE) $*-$(shell cat $*-latest.version)-snap.index
+	ln -sf $*-$(shell cat $*-latest.version)-snap.index $@
diff --git a/index.php b/index.php
new file mode 100644
index 0000000..5ac793f
--- /dev/null
+++ b/index.php
@@ -0,0 +1,145 @@
+<?php
+
+// find_expr runs nutrimatic's find-expr on $index with the query $expr and
+// reads its stdout line by line, calling $match_fn($score, $text) for each
+// line. If $match_fn returns a non-false value, the search stops early.
+// Afterwards every line the process wrote to stderr is passed to
+// $error_fn($line).
+//
+// Returns the value of proc_close() for the process, or -1 if the process
+// could not be started (in which case $error_fn receives one message).
+function find_expr($index, $expr, $match_fn, $error_fn) {
+	// Each word is shell-escaped on its own, so neither $index nor $expr can
+	// be interpreted by the shell.
+	$cmd = implode(' ', array_map('escapeshellarg', [
+		"./nutrimatic/bin/find-expr",
+		$index,
+		$expr,
+	]));
+	// Cap CPU time at 30 seconds; exec replaces the shell with find-expr, so
+	// the process handle refers to find-expr itself.
+	$cmd = "ulimit -t 30; exec $cmd";
+	$descriptorspec = [
+		0 => ['pipe', 'r'],
+		1 => ['pipe', 'w'],
+		2 => ['pipe', 'w'],
+	];
+	$proc = proc_open($cmd, $descriptorspec, $pipes);
+	if ($proc === false) {
+		// Without this guard the pipe handles below would be unset.
+		$error_fn("could not start find-expr");
+		return -1;
+	}
+	list($stdin, $stdout, $stderr) = $pipes;
+	// find-expr gets no input.
+	fclose($stdin);
+	// assumption: anything written to stderr will fit in the pipe buffer
+	// nutrimatic only appears to write-then-exit, so It's Probably Fine
+	while ($line = fscanf($stdout, "%s %[^\n]s\n")) {
+		// First whitespace-delimited word is the score, the rest is the text.
+		list($score, $text) = $line;
+		if ($match_fn($score, $text)) {
+			break;
+		}
+	}
+	fclose($stdout);
+	// Stop the search if it is still running (e.g. after an early break).
+	proc_terminate($proc);
+	while ($err = fgets($stderr)) {
+		$error_fn($err);
+	}
+	fclose($stderr);
+	$retval = proc_close($proc);
+	return $retval;
+}
+
+// index_name maps an index file name to its display name by dropping a
+// trailing ".index"; names without that suffix are returned unchanged.
+function index_name($file) {
+	$suffix_pattern = '/\.index$/';
+	$name = preg_replace($suffix_pattern, '', $file);
+	return $name;
+}
+
+// Indexes are discovered from the working directory. glob() returns false on
+// error, which must not reach the foreach loops.
+$index_files = glob("*.index");
+if ($index_files === false) {
+	$index_files = [];
+}
+$default_index = "enwiki-latest.index";
+$selected_index = $default_index;
+// ?idx= selects an index by display name. Only a string that exactly equals
+// a discovered index name is honoured; anything else keeps the default.
+$requested_index = (isset($_GET['idx']) && is_string($_GET['idx'])) ? $_GET['idx'] : null;
+foreach ($index_files as $file) {
+	if (index_name($file) === $requested_index) {
+		$selected_index = $file;
+	}
+}
+// ?q= is the search expression; missing or non-string values mean "no query".
+$q = (isset($_GET['q']) && is_string($_GET['q'])) ? $_GET['q'] : "";
+// ?more= is how many '#' marker lines of output to show (default 1). The
+// query box is autofocused only on a first-page view.
+$more = 1;
+$autofocus = ' autofocus';
+if (!empty($_GET['more'])) {
+	// Clamp to at least 1 so negative or non-numeric values behave like the
+	// default depth.
+	$more = max(1, (int)($_GET['more']));
+	$autofocus = '';
+}
+// Requests deeper than this are refused by the page.
+$max_more = 10;
+
+?>
+<html>
+<head>
+	<meta name="viewport" content="initial-scale=1.0">
+	<title><?php
+	// Title: the query (if any), then the index name when it is not the
+	// default, then "nut".
+	if ($q) {
+		echo htmlspecialchars($q);
+		echo ' &raquo; ';
+	}
+	if ($selected_index != $default_index) {
+		echo htmlspecialchars(index_name($selected_index));
+		echo ' ';
+	}
+	echo "nut";
+	?></title>
+<style>
+
+</style>
+</head>
+<body>
+	<form>
+		<fieldset>
+			<legend><a href=".">nut</a></legend>
+			<select name="idx">
+				<?php
+				// One <option> per discovered index file; the index in use
+				// is marked selected.
+				foreach ($index_files as $file) {
+					$name = index_name($file);
+					$attrs = '';
+					if ($file == $selected_index) {
+						$attrs .= ' selected';
+					}
+					?>
+					<option<?= $attrs ?> value="<?=htmlspecialchars($name)?>"><?=htmlspecialchars($name)?></option>
+					<?php
+				}
+				?>
+			</select>
+			<input type="text" name="q" value="<?=htmlspecialchars($q)?>" <?=$autofocus?>/>
+			<input type="submit" />
+		</fieldset>
+	</form>
+	<?php
+	// Refuse depths beyond $max_more; otherwise run the query, or show the
+	// greeting when there is no query.
+	if ($more > $max_more) {
+		echo "no";
+	} else if ($q) {
+		?>
+		<ol>
+			<?php
+			// $depth counts the '#' lines seen so far. $anchor holds an id
+			// attribute to attach to the next element that is printed.
+			$depth = 0;
+			$anchor = '';
+			find_expr($selected_index, $q, function($score, $text) use (&$depth, &$anchor, $more) {
+				// A score of '#' is a marker line rather than a match
+				// (presumably a progress marker from find-expr -- confirm).
+				// The $more-th marker arms the "more" anchor; the search
+				// stops once more than $more markers have been seen.
+				if ($score == '#') {
+					$depth++;
+					if ($depth == $more) {
+						$anchor .= ' id="more"';
+					}
+					return $depth > $more;
+				}
+				?>
+				<li class="match" value="<?=(int)(float)($score)?>"<?=$anchor?>><?=htmlspecialchars($text)?></li>
+				<?php
+				// The anchor is attached to one element only.
+				$anchor = '';
+			}, function($err) {
+				// Lines from find-expr's stderr are listed with value 0.
+				?>
+				<li class="error" value="0"><?=htmlspecialchars($err)?></li>
+				<?php
+			});
+			?>
+		</ol>
+		<?php
+		// The search was cut off by the marker limit and the cap has not
+		// been reached: link to the same query one level deeper ($depth is
+		// $more + 1 here). $anchor is still set only if no match was printed
+		// after the marker that armed it.
+		if ($depth > $more && $more != $max_more) {
+			?>
+			<a<?=$anchor?> class="more" href="?<?=http_build_query([
+				'idx' => index_name($selected_index),
+				'q' => $q,
+				'more' => $depth,
+			])?>#more">MORE</a>
+			<?php
+		}
+	} else {
+		?>
+		<p>Hi I'm a <a href="//nutrimatic.org/">Nutrimatic</a></p>
+		<?php
+	}
+	?>
+</body>
+</html>
diff --git a/merge.sh b/merge.sh
new file mode 100755
index 0000000..6f764de
--- /dev/null
+++ b/merge.sh
@@ -0,0 +1,75 @@
+#!/bin/bash -e
+if [ $# != 1 ]; then
+	echo >&2 "usage: $0 [outfile]"
+	exit 1
+fi
+
+OUT="$1"
+MERGE_INDEXES=$(realpath nutrimatic/bin/merge-indexes)
+MAKE_INDEX=$(realpath nutrimatic/bin/make-index)
+
+# merge-indexes FDs: stdin, stdout, stderr, [in...], out
+# fatal assumption: we'll never exceed MAX_ARG
+MAXMERGE=$(($(ulimit -n)-4))
+CUTOFF=1
+TMPDIR=$(mktemp -d mergetmp.XXXXX)
+MERGEDIR=$TMPDIR/merge
+INDEXDIR=$TMPDIR/index
+mkdir -p $MERGEDIR $INDEXDIR
+
+(
+	cd $MERGEDIR
+	m=0
+	while true; do
+		eof=0
+		if [ -e EOF ]; then
+			eof=1
+		fi
+		files=($(shopt -s nullglob; ls -S *.index *.merge | tail -n $MAXMERGE))
+		if [ $eof -eq 0 -a ${#files[@]} -lt 2 ]; then
+			sleep 1
+			continue
+		fi
+		out=shard.$(printf '%05d' $m).merge
+		m=$((m+1))
+		echo $out: ${files[@]}
+		$MERGE_INDEXES $CUTOFF ${files[@]} $out && rm ${files[@]}
+		if [ $eof -eq 1 -a ${#files[@]} -lt $MAXMERGE ]; then
+			mv $out ../merged
+			exit 0
+		fi
+	done
+) &
+merge_pid=$!
+
+(
+	n=0
+	while true; do
+		cur=$INDEXDIR/shard.$(printf '%05d' $n).index
+		n=$((n+1))
+		next=$INDEXDIR/shard.$(printf '%05d' $n).index
+		eof=$INDEXDIR/EOF
+		while [ ! -e "$next" -a ! -e "$eof" ]; do
+			sleep 1
+		done
+		echo "$cur"
+		mv "$cur" "$MERGEDIR/"
+		if [ ! -e "$next" -a -e "$eof" ]; then
+			mv "$eof" "$MERGEDIR/EOF"
+			exit 0
+		fi
+	done
+) &
+copy_pid=$!
+
+trap "rm -rf '$TMPDIR'; kill 0; exit" INT TERM
+
+# consumes stdin
+$MAKE_INDEX $INDEXDIR/shard
+touch $INDEXDIR/EOF
+
+wait $copy_pid
+wait $merge_pid
+mv $TMPDIR/merged "$OUT"
+rm -r $TMPDIR
+exit 0
diff --git a/nutrimatic b/nutrimatic
new file mode 160000
+Subproject 2159786044ac078cbb37cc5ba426837aa44d590
diff --git a/wikiextractor b/wikiextractor
new file mode 160000
+Subproject 3162bb6c3c9ebd2d15be507aa11d6fa818a454a
author	Kevin Wallace <kevin@wallace.seattle.wa.us>	2019-05-09 00:16:02 -0700
committer	Kevin Wallace <kevin@wallace.seattle.wa.us>	2019-05-09 00:18:46 -0700
commit	2784977e4613260a3e40a1e6de412ce276e50467 (patch)
tree	9d5adc0b75996943803d8c594361de70fb8efe5b