hp-dial/preprocessing/clean.sh

51 lines
1.5 KiB
Bash
Raw Normal View History

#!/usr/bin/env sh
# Scratch file for the intermediate text produced by the first cleaning pass.
# "$temp" is only the mktemp template; "$tempname" is the file really created.
temp="/tmp/clean-hp-tempXXXXXXXXXX"
tempname=$(mktemp "$temp") || exit 1

# Delete the scratch file. -f keeps this quiet when the file is already gone.
cleanup() { rm -f -- "$tempname"; }

# Clean up on every exit path. The signal traps only call exit so that the
# EXIT trap does the actual removal and the script stops instead of carrying
# on without its scratch file.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
trap 'exit 131' QUIT
# Succeeds when $1 begins with a double quote.
_qr_starts_with_quote() {
  case $1 in
    \"*) return 0 ;;
  esac
  return 1
}

# Succeeds when $1 ends with a double quote, optionally followed by '.' or '!'.
_qr_ends_with_quote() {
  case $1 in
    *\" | *\". | *\"!) return 0 ;;
  esac
  return 1
}

# Succeeds when $1 is exactly one quoted span: '"', one or more characters
# that are not '"', a closing '"', then an optional trailing '.' or '!'.
_qr_is_complete_quote() {
  case $1 in
    *. | *!) _qr_body=${1%?} ;;
    *) _qr_body=$1 ;;
  esac
  case $_qr_body in
    \"?*\")
      _qr_inner=${_qr_body#\"}
      _qr_inner=${_qr_inner%\"}
      case $_qr_inner in
        *\"*) return 1 ;;
        *) return 0 ;;
      esac
      ;;
  esac
  return 1
}

#######################################
# Join quoted text that is spread over several input lines.
#
# A line that opens with '"' but does not close a quote is held back as
# pending. Each following line is appended to it with a single space (a
# leading '"' on the appended line is dropped). Once the joined text is a
# single quoted span it is printed as one line.
#
# Globals:   previous_line, merge, line (all overwritten)
# Arguments: $1 - optional input file; standard input is read when omitted
# Outputs:   processed lines on stdout
# Returns:   0
#######################################
quote_removal() {
  previous_line=""
  merge=""
  # "|| [ -n ... ]" keeps a final line that lacks a trailing newline.
  while read -r line || [ -n "$line" ]; do
    # previous_line is either empty or a pending text starting with '"'.
    if _qr_starts_with_quote "$previous_line"; then
      if _qr_starts_with_quote "$line"; then
        merge="$previous_line ${line#\"}"
      else
        merge="$previous_line $line"
      fi
    fi

    if _qr_is_complete_quote "$merge"; then
      # The pending text plus this line forms one complete quote.
      printf '%s\n' "$merge"
      previous_line=""
    elif _qr_starts_with_quote "$line"; then
      if [ -n "$merge" ]; then
        # Still open: keep accumulating.
        previous_line="$merge"
      elif _qr_ends_with_quote "$line"; then
        # Self-contained quoted line: drop the closing quote only when it is
        # followed by final punctuation ('"Hi".' becomes '"Hi.').
        case $line in
          *\".)
            _qr_stem=${line%\".}
            printf '%s.\n' "$_qr_stem"
            ;;
          *\"!)
            _qr_stem=${line%\"!}
            printf '%s!\n' "$_qr_stem"
            ;;
          *)
            printf '%s\n' "$line"
            ;;
        esac
        previous_line=""
      else
        previous_line="$line"
      fi
    elif _qr_ends_with_quote "$line"; then
      # Closing line. With nothing pending there is no merged text, so emit
      # the line itself rather than an empty line.
      printf '%s\n' "${merge:-$line}"
      previous_line=""
    else
      # Ordinary line: give up on any pending text and emit both unchanged.
      if [ -n "$previous_line" ]; then
        printf '%s\n' "$previous_line"
      fi
      printf '%s\n' "$line"
      previous_line=""
    fi
    merge=""
  done < "${1:-/dev/stdin}"

  # Do not lose an opening line that was still pending at end of input.
  if [ -n "$previous_line" ]; then
    printf '%s\n' "$previous_line"
  fi
}
# First pass over the input file ($1, or standard input when omitted):
#   1. strip, in order: one or two leading hyphens, a leading '"', the first
#      '—', the first '―', a leading '_' and a single leading space;
#   2. collapse adjacent duplicate lines;
#   3. join multi-line quotes with quote_removal;
#   4. strip any remaining leading '"' characters plus one following space.
# The result is stored in the scratch file "$tempname".
sed -E \
  -e 's|^--?||' \
  -e 's|^"?||' \
  -e 's|—||' \
  -e 's|―||' \
  -e 's|^_||' \
  -e 's|^ ||' \
  < "${1:-/dev/stdin}" |
  uniq |
  quote_removal |
  sed -E 's|^"+ ?||' > "$tempname"

# Both commands join a line ending in ',' with the line that follows it and
# then collapse adjacent duplicates. The first reads the scratch file, the
# second reads its standard input.
# NOTE(review): until-no-change.sh is not visible here; presumably it runs
# the first command once and re-applies the second until the output stops
# changing - confirm against that script. It is looked up relative to the
# current working directory.
initial_join="sed '/,$/ {N; s/\n/ /g;}' < '$tempname' | uniq"
repeat_join='sed "/,$/ {N; s/\n/ /g;}" | uniq'
sh until-no-change.sh "$initial_join" "$repeat_join"

# The scratch file is no longer needed.
rm "$tempname"