#!/usr/bin/env bash
#
# american fuzzy lop++ - corpus minimization tool
# ---------------------------------------------
#
# Originally written by Michal Zalewski
#
# Copyright 2014, 2015 Google Inc. All rights reserved.
#
# Copyright 2019-2024 AFLplusplus
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# This tool tries to find the smallest subset of files in the input directory
# that still trigger the full range of instrumentation data points seen in
# the starting corpus. This has two uses:
#
# - Screening large corpora of input files before using them as a seed for
# afl-fuzz. The tool will remove functionally redundant files and likely
# leave you with a much smaller set.
#
# (In this case, you probably also want to consider running afl-tmin on
# the individual files later on to reduce their size.)
#
# - Minimizing the corpus generated organically by afl-fuzz, perhaps when
# planning to feed it to more resource-intensive tools. The tool achieves
# this by removing all entries that used to trigger unique behaviors in the
# past, but have been made obsolete by later finds.
#
# Note that the tool doesn't modify the files themselves. For that, you want
# afl-tmin.
#
# This script must use bash because other shells may have hardcoded limits on
# array sizes.
#
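# Example invocation (illustrative paths): screen a seed corpus for an
# instrumented target that takes its input as a file argument (@@):
#
#   ./afl-cmin.bash -i seeds/ -o seeds_min/ -- ./target @@
#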
echo "corpus minimization tool for afl-fuzz"
echo
#########
# SETUP #
#########
# Process command-line options...
MEM_LIMIT=none
TIMEOUT=5000
unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN F_ARG \
  AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE UNICORN_MODE T_ARG \
  FRIDA_MODE NYX_MODE
export AFL_QUIET=1
while getopts "+i:o:f:m:t:T:eOQUAChXY" opt; do
case "$opt" in
"h")
;;
"i")
IN_DIR="$OPTARG"
;;
"o")
OUT_DIR="$OPTARG"
;;
"f")
STDIN_FILE="$OPTARG"
F_ARG=1
;;
"m")
MEM_LIMIT="$OPTARG"
MEM_LIMIT_GIVEN=1
;;
"t")
TIMEOUT="$OPTARG"
;;
"e")
EXTRA_PAR="$EXTRA_PAR -e"
;;
"A")
export AFL_CMIN_ALLOW_ANY=1
;;
"C")
export AFL_CMIN_CRASHES_ONLY=1
;;
"O")
EXTRA_PAR="$EXTRA_PAR -O"
FRIDA_MODE=1
;;
"Q")
EXTRA_PAR="$EXTRA_PAR -Q"
QEMU_MODE=1
;;
"Y")
EXTRA_PAR="$EXTRA_PAR -X"
NYX_MODE=1
;;
"X")
EXTRA_PAR="$EXTRA_PAR -X"
NYX_MODE=1
;;
"U")
EXTRA_PAR="$EXTRA_PAR -U"
UNICORN_MODE=1
;;
"T")
T_ARG="$OPTARG"
;;
"?")
exit 1
;;
esac
done
shift $((OPTIND-1))
TARGET_BIN="$1"
if [ "$TARGET_BIN" = "" -o "$IN_DIR" = "" -o "$OUT_DIR" = "" ]; then
cat 1>&2 <<_EOF_
Usage: $0 [ options ] -- /path/to/target_app [ ... ]
Required parameters:
-i dir - input directory with the starting corpus
-o dir - output directory for minimized files
Execution control settings:
-T tasks - how many parallel processes to create (default=1, "all"=nproc)
-f file - location read by the fuzzed program (default: stdin)
-m megs - memory limit for child process (default=$MEM_LIMIT MB)
-t msec - run time limit for child process (default: 5000ms)
-O - use binary-only instrumentation (FRIDA mode)
-Q - use binary-only instrumentation (QEMU mode)
-U - use unicorn-based instrumentation (Unicorn mode)
-X - use Nyx mode
Minimization settings:
-A - allow crashing and timeout inputs
-C - keep crashing inputs, reject everything else
-e - solve for edge coverage only, ignore hit counts
For additional tips, please consult README.md.
Environment variables used:
AFL_KEEP_TRACES: leave the temporary <out_dir>\.traces directory
AFL_NO_FORKSRV: run target via execve instead of using the forkserver
AFL_PATH: last resort location to find the afl-showmap binary
AFL_SKIP_BIN_CHECK: skip check for target binary
AFL_CUSTOM_MUTATOR_LIBRARY: custom mutator library (post_process and send)
AFL_PYTHON_MODULE: custom mutator library (post_process and send)
_EOF_
exit 1
fi
# Do a sanity check to discourage the use of /tmp, since we can't really
# handle this safely from a shell script.
if [ "$AFL_ALLOW_TMP" = "" ]; then
echo "$IN_DIR" | grep -qE '^(/var)?/tmp/'
T1="$?"
echo "$TARGET_BIN" | grep -qE '^(/var)?/tmp/'
T2="$?"
echo "$OUT_DIR" | grep -qE '^(/var)?/tmp/'
T3="$?"
echo "$STDIN_FILE" | grep -qE '^(/var)?/tmp/'
T4="$?"
echo "$PWD" | grep -qE '^(/var)?/tmp/'
T5="$?"
if [ "$T1" = "0" -o "$T2" = "0" -o "$T3" = "0" -o "$T4" = "0" -o "$T5" = "0" ]; then
echo "[-] Warning: do not use this script in /tmp or /var/tmp for security reasons." 1>&2
fi
fi
# If @@ is specified, but there's no -f, let's come up with a temporary input
# file name.
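# (For example, with "-- ./target @@" and no -f, the target's input will be
# staged at "$OUT_DIR/.traces/.cur_input".)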
TRACE_DIR="$OUT_DIR/.traces"
if [ "$STDIN_FILE" = "" ]; then
if echo "$*" | grep -qF '@@'; then
STDIN_FILE="$TRACE_DIR/.cur_input"
fi
fi
# Check for obvious errors.
if [ ! "$T_ARG" = "" -a -n "$F_ARG" -a ! "$NYX_MODE" == 1 ]; then
echo "[-] Error: -T and -f can not be used together." 1>&2
exit 1
fi
if [ ! "$MEM_LIMIT" = "none" ]; then
if [ "$MEM_LIMIT" -lt "5" ]; then
echo "[-] Error: dangerously low memory limit." 1>&2
exit 1
fi
fi
if [ ! "$TIMEOUT" = "none" ]; then
if [ "$TIMEOUT" -lt "10" ]; then
echo "[-] Error: dangerously low timeout." 1>&2
exit 1
fi
fi
if [ "$NYX_MODE" = "" ]; then
if [ ! -f "$TARGET_BIN" -o ! -x "$TARGET_BIN" ]; then
TNEW="`which "$TARGET_BIN" 2>/dev/null`"
if [ ! -f "$TNEW" -o ! -x "$TNEW" ]; then
echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2
exit 1
fi
TARGET_BIN="$TNEW"
fi
fi
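# Targets built with a non-default map size embed the string
# AFL_DUMP_MAP_SIZE; running such a binary with AFL_DUMP_MAP_SIZE=1 set makes
# it print its map size (e.g. "262144") and exit, which we then forward to
# afl-showmap via AFL_MAP_SIZE.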
grep -aq AFL_DUMP_MAP_SIZE "$TARGET_BIN" && {
  echo "[!] Trying to obtain the map size of the target ..."
  # Prepend ./ only to plain file names so that relative targets are not
  # looked up via PATH; absolute and already-relative paths stay untouched.
  case "$TARGET_BIN" in
    */*) DUMP_BIN="$TARGET_BIN" ;;
    *)   DUMP_BIN="./$TARGET_BIN" ;;
  esac
  MAPSIZE=`AFL_DUMP_MAP_SIZE=1 "$DUMP_BIN" 2>/dev/null`
  test -n "$MAPSIZE" && {
    export AFL_MAP_SIZE=$MAPSIZE
    echo "[+] Setting AFL_MAP_SIZE=$MAPSIZE"
  }
}
if [ "$AFL_SKIP_BIN_CHECK" = "" -a "$QEMU_MODE" = "" -a "$FRIDA_MODE" = "" -a "$UNICORN_MODE" = "" -a "$NYX_MODE" = "" ]; then
if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then
echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2
exit 1
fi
fi
if [ ! -d "$IN_DIR" ]; then
echo "[-] Error: directory '$IN_DIR' not found." 1>&2
exit 1
fi
test -d "$IN_DIR/default" && IN_DIR="$IN_DIR/default"
test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue"
find "$OUT_DIR" -name 'id[:_]*' -maxdepth 1 -exec rm -- {} \; 2>/dev/null
rm -rf "$TRACE_DIR" 2>/dev/null
rmdir "$OUT_DIR" 2>/dev/null
if [ -d "$OUT_DIR" ]; then
echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2
exit 1
fi
mkdir -m 700 -p "$TRACE_DIR" || exit 1
if [ ! "$STDIN_FILE" = "" ]; then
rm -f "$STDIN_FILE" || exit 1
touch "$STDIN_FILE" || exit 1
fi
SHOWMAP=`command -v afl-showmap 2>/dev/null`
if [ -z "$SHOWMAP" ]; then
TMP="${0%/afl-cmin.bash}/afl-showmap"
if [ -x "$TMP" ]; then
SHOWMAP=$TMP
fi
fi
if [ -z "$SHOWMAP" -a -x "./afl-showmap" ]; then
SHOWMAP="./afl-showmap"
else
if [ -n "$AFL_PATH" ]; then
SHOWMAP="$AFL_PATH/afl-showmap"
fi
fi
if [ ! -x "$SHOWMAP" ]; then
echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2
rm -rf "$TRACE_DIR"
exit 1
fi
THREADS=
if [ ! "$T_ARG" = "" ]; then
if [ "$T_ARG" = "all" ]; then
THREADS=$(nproc)
else
if [ "$T_ARG" -gt 1 -a "$T_ARG" -le "$(nproc)" ]; then
THREADS=$T_ARG
else
echo "[-] Error: -T parameter must between 2 and $(nproc) or \"all\"." 1>&2
fi
fi
else
if [ -z "$F_ARG" ]; then
echo "[*] Are you aware of the '-T all' parallelize option that massively improves the speed?"
fi
fi
IN_COUNT=$((`ls -- "$IN_DIR" 2>/dev/null | wc -l`))
if [ "$IN_COUNT" = "0" ]; then
echo "[-] Hmm, no inputs in the target directory. Nothing to be done."
rm -rf "$TRACE_DIR"
exit 1
fi
echo "[*] Are you aware that afl-cmin is faster than this afl-cmin.bash script?"
echo "[+] Found $IN_COUNT files for minimizing."
if [ -n "$THREADS" ]; then
if [ "$IN_COUNT" -lt "$THREADS" ]; then
THREADS=$IN_COUNT
echo "[!] WARNING: less inputs than threads, reducing threads to $THREADS and likely the overhead of threading makes things slower..."
fi
fi
FIRST_FILE=`ls "$IN_DIR" | head -1`
# Make sure that we're not dealing with a directory.
if [ -d "$IN_DIR/$FIRST_FILE" ]; then
echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2
rm -rf "$TRACE_DIR"
exit 1
fi
# Check for the more efficient way to copy files...
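# Hard links (ln) avoid duplicating file data, but they only work within a
# single filesystem; if the test link fails, we fall back to a plain cp.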
if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then
CP_TOOL=ln
else
CP_TOOL=cp
fi
# Make sure that we can actually get anything out of afl-showmap before we
# waste too much time.
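# A single run on the first input should produce a non-empty trace; an empty
# one usually means the target crashes, times out, or lacks instrumentation.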
echo "[*] Testing the target binary..."
if [ "$STDIN_FILE" = "" ]; then
AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$FIRST_FILE"
else
cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE"
AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null
fi
FIRST_COUNT=$((`grep -c . "$TRACE_DIR/.run_test"`))
if [ "$FIRST_COUNT" -gt "0" ]; then
echo "[+] OK, $FIRST_COUNT tuples recorded."
else
echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2
test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
exit 1
fi
TMPFILE=$OUT_DIR/.list.$$
if [ ! "$THREADS" = "" ]; then
ls -- "$IN_DIR" > $TMPFILE 2>/dev/null
IN_COUNT=$(cat $TMPFILE | wc -l)
SPLIT=$(($IN_COUNT / $THREADS))
if [ "$(($IN_COUNT % $THREADS))" -gt 0 ]; then
SPLIT=$(($SPLIT + 1))
fi
echo "[+] Splitting workload into $THREADS tasks with $SPLIT items on average each."
split -l $SPLIT $TMPFILE $TMPFILE.
fi
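# The split(1) call above names its chunks $TMPFILE.aa, $TMPFILE.ab, ...,
# one list of input file names per task; the task loop below globs for
# ${TMPFILE}.*.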
# Let's roll!
#############################
# STEP 1: COLLECTING TRACES #
#############################
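# Each input gets its own trace file, $TRACE_DIR/<fn>, holding one tuple per
# line (-Z selects afl-showmap's cmin-friendly output format).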
echo "[*] Obtaining traces for input files in '$IN_DIR'..."
if [ "$THREADS" = "" ]; then
(
CUR=0
if [ "$STDIN_FILE" = "" ]; then
ls "$IN_DIR" | while read -r fn; do
if [ -s "$IN_DIR/$fn" ]; then
CUR=$((CUR+1))
printf "\\r Processing file $CUR/$IN_COUNT... "
"$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn"
fi
done
else
ls "$IN_DIR" | while read -r fn; do
if [ -s "$IN_DIR/$fn" ]; then
CUR=$((CUR+1))
printf "\\r Processing file $CUR/$IN_COUNT... "
cp "$IN_DIR/$fn" "$STDIN_FILE"
"$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null
fi
done
fi
echo
)
else
  PIDS=
  CNT=0
  for inputs in $(ls ${TMPFILE}.*); do
    (
      if [ "$STDIN_FILE" = "" ]; then
        cat $inputs | while read -r fn; do
          if [ -s "$IN_DIR/$fn" ]; then
            "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn"
          fi
        done
      else
        # Give each task its own stdin file; the empty-input check belongs
        # inside the loop, where $fn is actually set.
        STDIN_FILE="$inputs.$$"
        cat $inputs | while read -r fn; do
          if [ -s "$IN_DIR/$fn" ]; then
            cp "$IN_DIR/$fn" "$STDIN_FILE"
            "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null
          fi
        done
      fi
    ) &
    PIDS="$PIDS $!"
  done
  echo "[+] Waiting for running tasks, PIDs:$PIDS"
  wait
  echo "[+] All $THREADS tasks completed."
  rm -f ${TMPFILE}*
  #echo trace dir files: $(ls $TRACE_DIR/*|wc -l)
fi
##########################
# STEP 2: SORTING TUPLES #
##########################
# With this out of the way, we sort all tuples by popularity across all
# datasets. The reasoning here is that we won't be able to avoid the files
# that trigger unique tuples anyway, so we will want to start with them and
# see what's left.
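# The resulting .all_uniq file is plain "uniq -c" output with the rarest
# tuples first, e.g. (tuple values are illustrative):
#
#       1 9321442
#       3 4104096
#     812 1020997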
echo "[*] Sorting trace sets (this may take a while)..."
ls "$IN_DIR" | sed "s#^#$TRACE_DIR/#" | tr '\n' '\0' | xargs -0 -n 1 cat | \
sort | uniq -c | sort -k 1,1 -n >"$TRACE_DIR/.all_uniq"
TUPLE_COUNT=$((`grep -c . "$TRACE_DIR/.all_uniq"`))
echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files."
#####################################
# STEP 3: SELECTING CANDIDATE FILES #
#####################################
# The next step is to find the best candidate for each tuple. The "best"
# part is understood simply as the smallest input that includes a particular
# tuple in its trace. Empirical evidence suggests that this produces smaller
# datasets than more involved algorithms that could be still pulled off in
# a shell script.
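# Thanks to "ls -rS", files are visited from smallest to largest, so in the
# .candidate_list built below the first occurrence of a tuple already names
# the smallest file that covers it. Lines look like (illustrative):
#
#   9321442 id:000017,src:000005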
echo "[*] Finding best candidates for each tuple..."
CUR=0
ls -rS "$IN_DIR" | while read -r fn; do
CUR=$((CUR+1))
printf "\\r Processing file $CUR/$IN_COUNT... "
sed "s#\$# $fn#" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list"
test -s "$TRACE_DIR/$fn" || echo Warning: $fn is ignored because of crashing the target
done
echo
##############################
# STEP 4: LOADING CANDIDATES #
##############################
# At this point, we have a file of tuple-file pairs, sorted by file size
# in ascending order (as a consequence of ls -rS). By doing sort keyed
# only by tuple (-k 1,1) and configured to output only the first line for
# every key (-s -u), we end up with the smallest file for each tuple.
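# For example (illustrative values), the line "9321442 id:000017,src:000005"
# becomes:
#
#   BEST_FILE[9321442]="id:000017,src:000005"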
echo "[*] Sorting candidate list (be patient)..."
sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \
  sed 's/^/BEST_FILE[/;s/ /]="/;s/$/"/' >"$TRACE_DIR/.candidate_script"
if [ ! -s "$TRACE_DIR/.candidate_script" ]; then
  echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1
fi
# The sed command converted the sorted list to a shell script that populates
# BEST_FILE[tuple]="fname". Let's load that!
. "$TRACE_DIR/.candidate_script"
##########################
# STEP 5: WRITING OUTPUT #
##########################
# The final trick is to grab the top pick for each tuple, unless said tuple is
# already set due to the inclusion of an earlier candidate; and then put all
# tuples associated with the newly-added file to the "already have" list. The
# loop works from least popular tuples and toward the most common ones.
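# In effect this is a greedy set cover: walking .all_uniq from the rarest
# tuple to the most common, copy BEST_FILE[tuple] into the output whenever
# the tuple is not yet covered, then mark every tuple in that file's trace
# as covered (the .already_have list).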
echo "[*] Processing candidates and writing output files..."
CUR=0
touch "$TRACE_DIR/.already_have"
while read -r cnt tuple; do
  CUR=$((CUR+1))
  printf "\\r Processing tuple $CUR/$TUPLE_COUNT with count $cnt... "
  # If we already have this tuple, skip it.
  grep -q "^$tuple\$" "$TRACE_DIR/.already_have" && continue
  FN=${BEST_FILE[tuple]}
  # echo "tuple nr $CUR ($tuple cnt=$cnt) -> $FN" >> "$TRACE_DIR/.log"
  $CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN"
  if [ "$((CUR % 5))" = "0" ]; then
    sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp"
    mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have"
  else
    cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have"
  fi
done <"$TRACE_DIR/.all_uniq"
echo
OUT_COUNT=`ls -- "$OUT_DIR" | wc -l`
if [ "$OUT_COUNT" = "1" ]; then
echo "[!] WARNING: All test cases had the same traces, check syntax!"
fi
echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'."
echo
test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
exit 0