From 6a288a6338df748cce5ba1005dfe75a28b055579 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Fri, 25 Sep 2020 17:45:45 +0000 Subject: [PATCH] Use grep instead, which is faster but uses more memory --- dedupe | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dedupe b/dedupe index 855ae5e..3315f0a 100755 --- a/dedupe +++ b/dedupe @@ -3,13 +3,15 @@ function usage_exit { echo 'Usage: dedupe FILE1 FILE2' >&2 echo >&2 echo 'Prints all lines from FILE2 that do not appear in FILE1, in the order of FILE2.' >&2 - echo 'WARNING: FILE1 has to be read into memory fully. If your files are sorted, use comm instead.' >&2 + echo "WARNING: FILE1 has to be read into memory fully, and memory use is roughly 40 times FILE1's size. If your files are sorted, use comm instead." >&2 exit $1 } if [[ "$1" == '-h' || "$1" == '--help' ]]; then usage_exit 0; fi if [[ $# -ne 2 ]]; then usage_exit 1; fi -# Perl seems to be ~30 % faster for this. +# Perl seems to be ~30 % faster than AWK for this, but grep is ~2-3 times faster than Perl. +# AWK uses the least memory, Perl about 1.5 times as much, grep twice as much (as AWK). #awk 'NR==FNR { s[$0]=1; next; } !($0 in s)' "$1" "$2" -perl -ne 'if (@ARGV == 1) { $seen{$_}=1; } else { print $_ if !(exists $seen{$_}); }' "$1" "$2" +#perl -ne 'if (@ARGV == 1) { $seen{$_}=1; } else { print $_ if !(exists $seen{$_}); }' "$1" "$2" +grep -F -x -v -f "$1" "$2"