eu data and the shell scripts used to create it

This commit is contained in:
i.ortega 2020-05-03 01:23:27 +02:00
commit 68e5f235a1
6 changed files with 1517551 additions and 0 deletions

1019629
data/eu.txt Normal file

File diff suppressed because it is too large Load Diff

497854
data/eu_train.tsv Normal file

File diff suppressed because it is too large Load Diff

50
preprocessing/clean.sh Normal file
View File

@ -0,0 +1,50 @@
#!/usr/bin/env sh
# clean.sh [FILE]
#
# Cleans a text corpus read from FILE (standard input when FILE is omitted)
# and writes the result to standard output. Intermediate output is staged in
# a scratch file created from the mktemp template below.
temp="/tmp/clean-hp-tempXXXXXXXXXX"
tempname=$(mktemp "$temp") || exit 1
# On interruption remove the file mktemp actually created ($tempname, not the
# template in $temp) and stop, since the later stages read that file.
trap 'rm -f -- "$tempname"; exit 1' INT TERM QUIT
# _quote_is_complete TEXT
# Succeeds when TEXT is exactly one quotation: an opening double quote, one or
# more characters that are not double quotes, a closing double quote, and
# optionally a single trailing "." or "!".
_quote_is_complete() {
  _qr_text=$1
  # Drop the optional trailing punctuation mark before inspecting the quotes.
  case $_qr_text in
    *[.!]) _qr_text=${_qr_text%?} ;;
  esac
  # Must be: quote, at least one character, quote.
  case $_qr_text in
    '"'?*'"') ;;
    *) return 1 ;;
  esac
  _qr_text=${_qr_text#\"}
  _qr_text=${_qr_text%\"}
  # No further quote may appear between the two delimiters.
  case $_qr_text in
    *'"'*) return 1 ;;
  esac
  return 0
}

# quote_removal [FILE]
#
# Reads FILE (standard input when omitted) line by line and re-joins
# quotations that were split over several lines, writing to standard output.
#
# A line that starts with a double quote but does not close it is held back
# in $previous_line; following lines are appended to it (separated by one
# space, dropping a repeated leading quote) until the accumulated text forms
# one complete quotation, which is then printed as a single line.
#
# `read` is used without clearing IFS, so leading and trailing blanks of each
# line are trimmed. All matching is done with shell patterns, so no external
# process is started per line.
#
# Note: previous_line, merge and line are plain (global) shell variables.
quote_removal() {
  previous_line=""
  merge=""
  # `|| [ -n "$line" ]` keeps a final line that lacks a trailing newline.
  while read -r line || [ -n "$line" ]; do
    merge=""

    # Continue a pending quotation with the current line.
    case $previous_line in
      '"'*)
        case $line in
          '"'*)
            _qr_rest=${line#\"}
            merge="$previous_line $_qr_rest"
            ;;
          *)
            merge="$previous_line $line"
            ;;
        esac
        ;;
    esac

    # The accumulated text is now one complete quotation: emit it.
    if _quote_is_complete "$merge"; then
      printf '%s\n' "$merge"
      previous_line=""
      continue
    fi

    case $line in
      '"'*)
        if [ -n "$merge" ]; then
          # Still inside an unfinished quotation: keep accumulating.
          previous_line=$merge
        else
          case $line in
            *'".')
              # Quote closed on the same line: drop the closing quote that
              # precedes the final punctuation mark.
              _qr_head=${line%\".}
              printf '%s.\n' "$_qr_head"
              previous_line=""
              ;;
            *'"!')
              _qr_head=${line%\"!}
              printf '%s!\n' "$_qr_head"
              previous_line=""
              ;;
            *'"')
              printf '%s\n' "$line"
              previous_line=""
              ;;
            *)
              # Quotation opened but not closed: hold it for the next line.
              previous_line=$line
              ;;
          esac
        fi
        ;;
      *'"' | *'".' | *'"!')
        # Line ends with a closing quote. Print the accumulated text when
        # there is one; otherwise print the line itself rather than an empty
        # line, so no input is lost.
        printf '%s\n' "${merge:-$line}"
        previous_line=""
        ;;
      *)
        # Ordinary line: release any held text unchanged, then the line.
        if [ -n "$previous_line" ]; then
          printf '%s\n' "$previous_line"
        fi
        printf '%s\n' "$line"
        previous_line=""
        ;;
    esac
  done < "${1:-/dev/stdin}"

  # Flush a quotation that was still open when the input ended.
  if [ -n "$previous_line" ]; then
    printf '%s\n' "$previous_line"
  fi
}
# Stage 1: per-line normalisation, written to the scratch file.
# The first sed applies these substitutions in order, once each per line:
# strip a leading "-" or "--", strip one leading double quote, delete the
# first "—" and the first "―" anywhere in the line, strip a leading "_", and
# strip one leading space. Adjacent duplicate lines are then collapsed,
# quotations are re-joined by quote_removal, and any remaining leading double
# quotes (plus one following space) are removed.
input_file="${1:-/dev/stdin}"
sed -E \
  -e 's|^--?||' \
  -e 's|^"?||' \
  -e 's|—||' \
  -e 's|―||' \
  -e 's|^_||' \
  -e 's|^ ||' \
  < "$input_file" |
  uniq |
  quote_removal |
  sed -E 's|^"+ ?||' > "$tempname"

# Stage 2: join every line that ends in a comma with the line after it (the
# newline becomes a space) and collapse adjacent duplicates. One pass can
# leave a joined line that again ends in a comma, so the step is handed to
# until-no-change.sh (looked up in the current directory): the first command
# reads the scratch file, the second is the filter applied on later passes.
seed_cmd="sed '/,$/ {N; s/\n/ /g;}' < '$tempname' | uniq"
step_cmd='sed "/,$/ {N; s/\n/ /g;}" | uniq'
sh until-no-change.sh "$seed_cmd" "$step_cmd"

# The scratch file is no longer needed.
rm "$tempname"

View File

@ -0,0 +1,3 @@
#!/usr/bin/env sh
# Usage: <this script> [FILE]
#
# Runs clean.sh (looked up in the current directory) on FILE, or on standard
# input when FILE is omitted, and prints every two consecutive output lines
# as one row with the two lines separated by a tab.
src="${1:-/dev/stdin}"
sh clean.sh < "$src" | paste - -

View File

@ -0,0 +1,12 @@
#!/usr/bin/env sh
# until-no-change.sh INIT_CMD STEP_CMD
#
# Runs INIT_CMD with `sh -c` to produce an initial text, then repeatedly pipes
# the current text through STEP_CMD (also run with `sh -c`, text on standard
# input) until a pass returns exactly what it was given. The final text is
# written to standard output.

# until_no_change INIT_CMD STEP_CMD
# Implements the behaviour described above. Uses two scratch files, which are
# removed when the shell exits (including on INT, TERM and QUIT).
until_no_change() {
  current=$(mktemp '/tmp/until-no-change-currentXXXXXXX') || return 1
  prev=$(mktemp '/tmp/until-no-change-prevXXXXXXX') || {
    rm -f -- "$current"
    return 1
  }
  # Clean up on every exit path; the signal traps exit so that the EXIT trap
  # runs for them as well.
  trap 'rm -f -- "$current" "$prev"' EXIT
  trap 'exit 1' INT TERM QUIT

  sh -c "$1" > "$current"

  # $prev starts out empty, so the loop body runs at least once whenever
  # INIT_CMD produced any output. cmp -s compares the files without producing
  # or buffering a textual diff.
  until cmp -s "$current" "$prev"; do
    cp "$current" "$prev"
    # Redirections are applied here instead of splicing the scratch paths
    # into the command string.
    sh -c "$2" < "$prev" > "$current"
  done

  cat "$current"
}

until_no_change "$@"

3
requirements.txt Normal file
View File

@ -0,0 +1,3 @@
torch==1.1.0
torchtext==0.5.0
tqdm