alchi-journal/scan-adf.sh

548 lines
14 KiB
Bash
Executable file

#! /usr/bin/env bash
# scan-adf.sh
# single pass duplex scanning of multiple pages (ADF)
# with Brother ADS-3000N scanner
this_user_uid=$(id --user)
this_user_gid=$(id --group)
output_user_uid=1000
output_user_gid=100
tempdir="/run/user/$output_user_uid"
keep_tempfile=true # debug
keep_tempfile=false
write_logfile=true # debug
#write_logfile=false
do_chown=false
if ((output_user_uid != this_user_uid)) || ((output_user_gid != this_user_gid)); then
do_chown=true
fi
# TODO dynamic. use "lsusb" to find the scanner device
# sudo scanimage -L
# $ sudo scanimage -L
# device `brother5:bus1;dev3' is a Brother ADS-3000N USB scanner
# note: "dev3" does not correspond with output of lsusb
# $ lsusb | grep ADS-3000N
# Bus 001 Device 073: ID 04f9:03b8 Brother Industries, Ltd ADS-3000N
# $ sudo scanimage -L
# device `brother4:bus4;dev1' is a Brother ADS-3000N USB scanner
# device `brother5:bus1;dev4' is a Brother ADS-3000N USB scanner
device_name="brother5:bus1;dev3"
device_name="brother5:bus1;dev4"
device_name="brother5:bus2;dev2" # Bus 002 Device 020: ID 04f9:03b8 Brother Industries, Ltd ADS-3000N
device_name="$1"
shift
if [ -z "$device_name" ]; then
echo "error: missing argument: device_name" >&2
echo "example use: $0 brother5:bus2;dev2" >&2
echo "hint: use this to get the device name: scanimage -L" >&2
exit 1
fi
# sudo scanimage --device-name="$device_name" --help
#source="Flatbed"
source="Automatic Document Feeder(left aligned,Duplex)"
# 24bit Color[Fast]
# Black & White
# True Gray
# Gray[Error Diffusion]
mode="24bit Color[Fast]"
# see benchmark.txt
# pnm and tiff are fastest and best quality
# png is much slower
format=pnm
# din a4: 210 x 297 mm
extra_options=(--MultifeedDetection=yes --SkipBlankPage=no -x 210 -y 297)
quality=80
small_scale=50%
# 15 MByte png file
resolution=300
if [[ "$(id -u)" != "0" ]]; then
echo "error: you must run this script as root. hint: sudo $0"
exit 1
fi
if false; then
# scan about 10 white sheets to test the scan quality
# if there are vertical grey lines then clean the sensor glass with water or acetone
# see also: done-vertical-lines-from-adf-scanner.txt
sudo scanimage --device-name=brother5:bus1\;dev3 --resolution=300 --format=pnm \
--batch=/run/user/1000/scan-calibration.%d.pnm --batch-print --mode=24bit\ Color\[Fast\] \
--source=Automatic\ Document\ Feeder\(left\ aligned\,Duplex\) --MultifeedDetection=yes \
--SkipBlankPage=no -x 210 -y 297
d1=scan-calibration.$(date -Is --utc);
for pnm in /run/user/1000/scan-calibration.*.pnm; do
n=${pnm%.*}; n=${n##*.}; np=$(printf "%03d\n" $n);
d2=even; if ((n % 2 == 1)); then d2=odd; fi;
png=$d1/$d2/$np.png;
mkdir -p $(dirname $png);
echo writing $png;
convert $pnm $png;
done
fi
# https://imagemagick.org/script/webp.php
# these produce large output:
# -define webp:alpha-compression=0
# -define webp:exact=true
# thresholds can produce ugly transparent output. example: scan.2023-10-03.10-31-42.1.webp
# -black-threshold $bth% -white-threshold $wth%
# -black-threshold "${lowthresh}%" -white-threshold "${highthresh}%"
# level should be enough:
# -level ${lowthresh}x${highthresh}%
# contrast: increase contrast to remove noise in document scans
# https://superuser.com/questions/622950/is-there-a-way-to-increase-the-contrast-of-a-pdf-that-was-created-by-scanning-a
# http://www.fmwconcepts.com/imagemagick/thresholds/index.php # -t soft -l 25 -h 75
# to find these threshold values, use gimp > colors > levels
#lowthresh=15 # text is too light
lowthresh=40 # produce dark text # 40/100 = 100/256
# highthresh:
# lower = more white, less artefacts, more loss of grey lines
#highthresh=80
# 98 is better than 100
# to convert a slightly grey background to a pure white background
highthresh=98
# i need to go this low, to remove vertical grey lines produced by my ADF scanner
# see also https://github.com/ImageMagick/ImageMagick/discussions/6042
#highthresh=85
# i really need to go THIS low to remove all grey lines on all pages. oof!
# this is lossy, because my hand-written text also contains grey lines
#highthresh=66 # 66/100 = 170/256
# set profile to fix red tint (red color cast)
# https://blog.teamgeist-medien.de/2015/07/typo3-graphicsmagick-rotstich-bei-bildern-beheben-farbfehler.html
# https://legacy.imagemagick.org/discourse-server/viewtopic.php?t=22549
#large_convert_options+=( -set colorspace RGB +profile '*' )
# my document scanner adds a white bar below the scanned image. remove it by cropping
# input size: 2480x3508
crop_x=2480; crop_y=3342 # resolution=300
shared_convert_options=(
"${extra_convert_options[@]}"
-set colorspace RGB
+profile '*'
-quality $quality
-define webp:lossless=false
-define webp:auto-filter=true
-define webp:image-hint=graph
# "+repage" required for webp output with "-crop"
# "+0+0" is required for "-crop" otherwise it produces multiple images
# or an animated webp image with multiple frames
-crop $crop_x"x"$crop_y+0+0 +repage
# "-coalesce" is required for webp output
# https://github.com/ImageMagick/ImageMagick/issues/6041
-coalesce
);
small_convert_options=(
#"${shared_convert_options[@]}"
-scale $small_scale
-level ${lowthresh}x${highthresh}%
);
large_convert_options=(
#"${shared_convert_options[@]}"
);
# batch convert
# code to manually scan one page from flatbed
if false; then
date_time=$(date +%Y-%m-%d.%H-%M-%S)
mkdir /run/user/$(id --user) 2>/dev/null || true
temp_path="$tempdir/scan.$date_time.1.$format"
set -x
scanimage \
--device-name="$device_name" \
--resolution=$resolution \
--format=$format \
--output-file="$temp_path" \
--mode=Color \
--source="$source" \
"${extra_options[@]}"
set +x
echo "done $temp_path"
fi
# batch convert
# code to manually convert some temporary png files
if false; then
for temp_path in $tempdir/scan.$(date +"%Y-%m-%d.")*.$format;
do
bth=40;
wth=98;
scale=50%;
quality=80;
small_scale=50%;
extra_convert_options=()
#extra_convert_options=(-rotate 90)
# https://imagemagick.org/script/webp.php
# these produce large output:
# -define webp:alpha-compression=0
# -define webp:exact=true
# thresholds can produce ugly transparent output. example: scan.2023-10-03.10-31-42.1.webp
# -black-threshold $bth% -white-threshold $wth%
# level should be enough:
# -level $bth"x"$wth%
shared_convert_options=("${extra_convert_options[@]}"
-set colorspace RGB +profile '*' -quality $quality
-define webp:lossless=false
-define webp:auto-filter=true -define webp:image-hint=graph);
small_convert_options=("${shared_convert_options[@]}"
-scale $small_scale -level $bth"x"$wth%);
large_convert_options=("${shared_convert_options[@]}");
webp_small="$(basename "$temp_path" .$format).webp";
webp_large="large/$(basename "$temp_path" .$format).large.webp";
# note: convert already uses multiple cpu cores
# so dont run convert in parallel, or set MAGICK_THREAD_LIMIT=1
# https://superuser.com/questions/316365/parallel-processing-slower-than-sequential
set -x;
echo "writing $webp_large"
convert "$temp_path" "${large_convert_options[@]}" "$webp_large";
echo "writing $webp_small"
convert "$temp_path" "${small_convert_options[@]}" "$webp_small";
set +x;
done
fi
# date
date_time=$(date +%Y-%m-%d.%H-%M-%S)
mkdir /run/user/$(id --user) 2>/dev/null || true
# tempfile path format
# "%d" will be replaced by an incrementing number
temp_path_format="$tempdir/scan.$date_time.%d.$format"
# add zero-padding to the page number
# to fix the sort order of files
# without having to use "ls --sort=version" etc
# this format string is passed to printf like
# $ printf "%03d" 1
# 001
page_number_format="%03d"
# pass all args of this script to convert (TODO better?)
# example: -rotate 90
extra_convert_options=("$@")
# https://stackoverflow.com/a/30022297/10440128
# read_char var
# FIXME stty: 'standard input': Inappropriate ioctl for device
read_char() {
set -x
stty -icanon -echo
#eval "$1=\$(dd bs=1 count=1 2>/dev/null)"
# fix for input '\n'
eval "$1=\$'\\x$(dd bs=1 count=1 2>/dev/null | xxd -p)'"
# this breaks line-editing with "read"
#stty icanon echo
stty sane
set +x
}
todo_rescan_pages=""
# https://stackoverflow.com/questions/6883363/read-user-input-inside-a-loop
# https://stackoverflow.com/questions/16854280/a-variable-modified-inside-a-while-loop-is-not-remembered
# while read n <&3; do echo n=$n; read i; echo i=$i; done 3< <(seq 3)
while read temp_path <&3; do
echo "temp path: $temp_path"
# add zero-padding to the page number
# get extension
temp_path_extension="${temp_path##*.}"
# remove extension
temp_path_base="${temp_path%.*}"
# get page number
temp_path_number="${temp_path_base##*.}"
# remove page number
temp_path_base="${temp_path_base%.*}"
temp_path_new="$temp_path_base.$(printf "$page_number_format" "$temp_path_number").$temp_path_extension"
mv -v "$temp_path" "$temp_path_new"
temp_path="$temp_path_new"
# this is fancy but slow
# TODO find something better to rename and rotate images
#if false; then
# show image
# dont connect feh to stdin or stdout
# otherwise input-line-editing is broken (backspace creates ugly input)
feh --scale-down "$temp_path" </dev/zero >/dev/null 2>&1 &
feh_pid=$!
this_extra_convert_options=()
# ask user
echo " e = edit the image with gimp"
#echo " k = delete the image and rescan it later"
echo " k = delete the image"
echo " r = rotate by 90 degrees to the right = clockwise"
echo " v = rotate by 180 degrees"
echo " l = rotate by 90 degrees to the left = counter clockwise"
echo " * = continue (press any other key, like enter or space)"
echo -n "what should i do? "
#read_char response
read -n1 response
echo
case "$response" in
e)
# edit
gimp "$temp_path" &
echo "edit the image in gimp, then: file > overwrite"
echo "hit enter when done editing"
read
;;
#r) # no, "r" is too close to "e"
k)
# rescan
rm -v "$temp_path"
if false; then
echo "adding page $temp_path_number to the 'TODO rescan pages' list"
todo_rescan_pages+=" $temp_path_number"
fi
kill $feh_pid 2>/dev/null
continue
;;
r)
this_extra_convert_options+=(-rotate 90)
;;
v)
this_extra_convert_options+=(-rotate 180)
;;
l)
this_extra_convert_options+=(-rotate 270)
;;
*)
echo "continuing to process $temp_path"
esac
default_title="$(basename "$temp_path")"
# remove extension
default_title="${default_title%.*}"
# remove the "scan." prefix
# move-images.sh expects filenames like 2023-11-25.06-00.some-name
default_title="${default_title#scan.}"
if [ -n "$last_title" ]; then
# re-use the datetime of the last title
# datetime format: yyyy-mm-dd.hh-mm
default_title=$(echo "$last_title" | sed -E 's/^([0-9]{4}-[0-9]{2}-[0-9]{2}\.[0-9]{2}-[0-9]{2})\..*$/\1/')
fi
# ask user for filename
read -e -p "please enter the basename: " -i "$default_title" title
# trim the entered title
title="$(echo "$title" | sed -E 's/^[ \t\r]+//; s/[ \t\r]+$//')"
# remove "\r"
# replace whitespace with "."
title="$(echo "$title" | sed -E 's/[\r]+//g; s/[ \t]+/./g')"
if [ -z "$title" ]; then
title="$default_title"
fi
echo "using basename: ${title@Q}"
kill $feh_pid 2>/dev/null
# run "convert" processes in background
# so they run in parallel and the loop can continue
# we only have to keep the "$temp_path" files
# until all "convert" are done
# but we keep "$temp_path" anyway, so... works for now
# convert large
o="large/$title.large.webp"
[ -d large ] || mkdir -p large
echo creating "$o"
convert_args_large=(
convert
"$temp_path"
"${extra_convert_options[@]}"
"${shared_convert_options[@]}"
"${this_extra_convert_options[@]}"
"${large_convert_options[@]}"
"$o"
)
echo "${convert_args_large[@]}"
# convert small
o_small="$title.webp"
echo creating "$o_small"
convert_args_small=(
convert
"$temp_path"
"${extra_convert_options[@]}"
"${shared_convert_options[@]}"
"${this_extra_convert_options[@]}"
"${small_convert_options[@]}"
"$o_small"
)
echo "${convert_args_small[@]}"
if $keep_tempfile; then
echo keeping tempfile "$temp_path"
fi
# run "convert" in the background
# and continue with the next image
(
"${convert_args_large[@]}"
if $do_chown; then
chown $output_user_uid:$output_user_gid "$o"
fi
"${convert_args_small[@]}"
if $do_chown; then
chown $output_user_uid:$output_user_gid "$o_small"
fi
if $keep_tempfile; then
if $do_chown; then
chown $output_user_uid:$output_user_gid "$temp_path"
fi
else
rm -f "$temp_path"
fi
) &
# the original tempfile is useful
# to produce high-quality transformed images
# transformed? usually rotation by 90 / 180 / 270 degrees
# lossless rotation is only possible with jpeg images
# not with compressed image formats like webp, jp2, ...
# (png is an uncompressed image format)
# but once correctly rotated, webp gives best quality for file size
# jp2 is useful for embedding in pdf documents
# because jp2 images are smaller than jpg images
# and because pdf does not support webp images
# TODO in the future, delete the tempfile when its no longer needed
# find "$(dirname "$temp_path")" -mtime +10min -delete # ... or so
last_title="$title"
done 3< <(
# redirect stderr to log file to keep the terminal clean
# TODO better. buffer the output and print it as soon as possible, dont create a logfile
if $write_logfile; then
scanimage_log_path="$tempdir/scanimage.$(date -Is --utc).log"
echo "writing scanimage log to $scanimage_log_path" >&2
fi
#set -x
# FIXME --batch-print is not working?
scanimage_args=(
scanimage
--device-name="$device_name"
--resolution=$resolution
--format=$format
--batch="$temp_path_format"
--batch-print
--mode="$mode"
--source="$source"
"${extra_options[@]}"
)
printf "%q " "${scanimage_args[@]}" >&2; echo >&2
if $write_logfile; then
"${scanimage_args[@]}" 2>"$scanimage_log_path"
else
"${scanimage_args[@]}" 2>/dev/null
fi
)
if false; then
if [ -n "$todo_rescan_pages" ]; then
echo "TODO rescan pages:" $todo_rescan_pages
fi
fi
exit
# open result
echo opening "$o_small" ...
"${image_viewer[@]}" "$o_small" &