summary refs log tree commit diff
path: root/merge.sh
diff options
context:
space:
mode:
Diffstat (limited to 'merge.sh')
-rwxr-xr-x merge.sh 75
1 files changed, 75 insertions, 0 deletions
diff --git a/merge.sh b/merge.sh
new file mode 100755
index 0000000..6f764de
--- /dev/null
+++ b/merge.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+#
+# merge.sh - build one merged index from the data supplied on stdin.
+#
+# Usage: merge.sh outfile < input
+#
+# make-index consumes stdin and writes numbered shard files. A background
+# copier hands finished shards to a background merger, which combines them
+# with merge-indexes until a single file is left; that file becomes outfile.
+
+# Enable errexit here rather than on the shebang line: shebang options are
+# dropped when the script is started as `bash merge.sh`.
+set -e
+
+if [[ $# -ne 1 ]]; then
+  # The output file is mandatory, so it is not shown in brackets.
+  echo >&2 "usage: $0 outfile"
+  exit 1
+fi
+
+OUT="$1"
+
+# Resolve the helpers to absolute paths now: the merger subshell changes
+# directory before it invokes merge-indexes.
+MERGE_INDEXES=$(realpath nutrimatic/bin/merge-indexes)
+MAKE_INDEX=$(realpath nutrimatic/bin/make-index)
+for tool in "$MERGE_INDEXES" "$MAKE_INDEX"; do
+  # Fail before any input is consumed if a helper cannot be run.
+  if [[ ! -x "$tool" ]]; then
+    echo >&2 "$0: missing or non-executable helper: $tool"
+    exit 1
+  fi
+done
+
+# merge-indexes FDs: stdin, stdout, stderr, [in...], out
+# so one merge can take (open-file limit - 4) inputs.
+# Fatal assumption: the argument list never exceeds ARG_MAX.
+fd_limit=$(ulimit -n)
+if [[ "$fd_limit" == unlimited ]]; then
+  # "unlimited" is not a number; fall back to a conservative cap.
+  fd_limit=1024
+fi
+MAXMERGE=$((fd_limit - 4))
+if (( MAXMERGE < 2 )); then
+  # Merging fewer than two inputs at a time can never reduce the file count.
+  echo >&2 "$0: open-file limit ($fd_limit) is too low to merge indexes"
+  exit 1
+fi
+
+# Passed to merge-indexes as its first argument.
+CUTOFF=1
+
+# Scratch area, created relative to the current directory.
+# NOTE(review): TMPDIR is also the conventional temp-directory environment
+# variable; if it is exported by the caller, child processes will see this
+# relative path instead - confirm the helpers do not rely on it.
+TMPDIR=$(mktemp -d mergetmp.XXXXX)
+MERGEDIR=$TMPDIR/merge
+INDEXDIR=$TMPDIR/index
+mkdir -p "$MERGEDIR" "$INDEXDIR"
+
+# Background merger. Works inside $MERGEDIR and repeatedly merges the
+# smallest pending *.index / *.merge files (at most $MAXMERGE per pass) into
+# a new shard.NNNNN.merge, deleting the inputs afterwards. Once the EOF
+# marker has been seen and a pass consumed every remaining file, the result
+# is moved to ../merged and the subshell exits 0. Any merge failure makes
+# the subshell exit 1 so the parent's `wait` can notice it.
+(
+  cd "$MERGEDIR" || exit 1
+  # Unmatched globs must expand to nothing rather than to the literal pattern.
+  shopt -s nullglob
+  m=0
+  while true; do
+    # Sample the EOF marker before listing files: the copier moves EOF in
+    # last, so if it is visible here every shard is already present.
+    eof=0
+    if [[ -e EOF ]]; then
+      eof=1
+    fi
+
+    # Only call ls when something matched; a bare `ls -S` would list the
+    # directory itself and pick up the EOF marker as an input.
+    pending=(*.index *.merge)
+    files=()
+    if (( ${#pending[@]} > 0 )); then
+      # ls -S sorts largest first, so tail keeps the smallest files.
+      while IFS= read -r f; do
+        files+=("$f")
+      done < <(ls -S -- "${pending[@]}" | tail -n "$MAXMERGE")
+    fi
+
+    if (( eof == 1 && ${#files[@]} == 0 )); then
+      echo >&2 "merge.sh: no indexes to merge"
+      exit 1
+    fi
+    # Before EOF, wait until there are at least two files worth merging.
+    if (( eof == 0 && ${#files[@]} < 2 )); then
+      sleep 1
+      continue
+    fi
+
+    out=shard.$(printf '%05d' "$m").merge
+    m=$((m + 1))
+    echo "$out: ${files[*]}"
+    # Stop on failure: continuing would either retry forever or promote a
+    # missing/partial output to the final result.
+    if ! "$MERGE_INDEXES" "$CUTOFF" "${files[@]}" "$out"; then
+      echo >&2 "merge.sh: merge-indexes failed while writing $out"
+      exit 1
+    fi
+    rm -- "${files[@]}"
+
+    # Fewer inputs than the cap means tail dropped nothing, so with EOF
+    # seen this output contains everything.
+    if (( eof == 1 && ${#files[@]} < MAXMERGE )); then
+      mv -- "$out" ../merged
+      exit 0
+    fi
+  done
+) &
+merge_pid=$!
+
+# Background copier. Moves shard.NNNNN.index files from $INDEXDIR to
+# $MERGEDIR in numeric order, printing each path as it goes, and finally
+# moves the EOF marker across so the merger knows no more shards will come.
+# Exits 0 after forwarding EOF, or 1 if an expected shard is missing.
+(
+  n=0
+  eof=$INDEXDIR/EOF
+  while true; do
+    cur=$INDEXDIR/shard.$(printf '%05d' "$n").index
+    n=$((n + 1))
+    next=$INDEXDIR/shard.$(printf '%05d' "$n").index
+
+    # A shard is handed over only once its successor or the EOF marker
+    # exists (presumably because the newest shard may still be being
+    # written - confirm against make-index).
+    while [[ ! -e "$next" && ! -e "$eof" ]]; do
+      sleep 1
+    done
+
+    if [[ ! -e "$cur" ]]; then
+      # Happens when EOF arrives without any shard having been produced.
+      # Forward EOF anyway so the merger stops polling, then report failure.
+      echo >&2 "merge.sh: expected shard is missing: $cur"
+      if [[ -e "$eof" ]]; then
+        mv -- "$eof" "$MERGEDIR/EOF"
+      fi
+      exit 1
+    fi
+
+    echo "$cur"
+    mv -- "$cur" "$MERGEDIR/"
+
+    # No successor and EOF present: that was the last shard.
+    if [[ ! -e "$next" && -e "$eof" ]]; then
+      mv -- "$eof" "$MERGEDIR/EOF"
+      exit 0
+    fi
+  done
+) &
+copy_pid=$!
+
+# Tear-down shared by every exit path (success, error, signal): stop the
+# two background workers if they are still running and remove the scratch
+# directory. Traps are cleared first so the handler cannot run twice.
+cleanup() {
+  trap - INT TERM EXIT
+  kill "$merge_pid" "$copy_pid" 2>/dev/null || true
+  # :? refuses to run rm -rf with an empty path.
+  rm -rf -- "${TMPDIR:?}"
+}
+trap cleanup EXIT
+# On a signal, additionally signal the whole process group so helpers
+# started by the workers stop too. cleanup has already cleared the traps,
+# so `kill 0` terminates this shell instead of re-entering the handler;
+# `exit 1` only runs if that kill fails.
+# NOTE: the workers were started before these traps were installed, so a
+# signal arriving in that short window is not cleaned up.
+trap 'cleanup; kill 0; exit 1' INT TERM
+
+# consumes stdin
+if ! "$MAKE_INDEX" "$INDEXDIR/shard"; then
+  echo >&2 "$0: make-index failed"
+  exit 1
+fi
+# Signal the copier that no further shards will appear.
+touch "$INDEXDIR/EOF"
+
+# Check each worker explicitly; a failed background job does not abort the
+# script by itself.
+if ! wait "$copy_pid"; then
+  echo >&2 "$0: shard copier failed"
+  exit 1
+fi
+if ! wait "$merge_pid"; then
+  echo >&2 "$0: index merger failed"
+  exit 1
+fi
+
+mv -- "$TMPDIR/merged" "$OUT" || exit 1
+# The EXIT trap removes the scratch directory.
+exit 0