-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathsetup_opensubs.sh
executable file
·47 lines (37 loc) · 1.46 KB
/
setup_opensubs.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/bin/bash
# Stop the whole run as soon as any command exits with a non-zero status.
set -o errexit

# Folder (relative to the working directory) holding the conversion scripts.
scripts='conversion_scripts'
# Download location of the OpenSubtitles2018 de-en archive.
url='http://opus.nlpl.eu/download.php?f=OpenSubtitles2018/de-en.txt.zip'
echo "| Downloading and unzipping corpus..."
# Download OpenSubtitles 2018 EN-DE and unzip it, unless the three corpus files
# named in the message below are already present. The files are tested
# explicitly instead of counting `ls OpenSubtitles.*` output, so unrelated
# files matching that pattern can neither fake nor defeat the check.
if [[ -f OpenSubtitles.de-en.ids && -f OpenSubtitles.de-en.en && -f OpenSubtitles.de-en.de ]]; then
  echo "| - OpenSubtitles seems to be downloaded and unzipped already. To repeat download, remove one of ['OpenSubtitles.de-en.ids', 'OpenSubtitles.de-en.en', 'OpenSubtitles.de-en.de']."
else
  # The URL contains '?', a glob character, so it must be quoted to reach wget
  # unchanged.
  wget "$url" -O source.de-en.zip
  # -o: overwrite existing files without prompting.
  unzip -o source.de-en.zip
fi
echo "| Extracting documents from XML files..."
# Run opusXML2docs.pl on the unzipped corpus files, writing into ./documents.
# The existence of that folder is what marks this step as done, so a failed
# extraction must not leave it behind: otherwise the next run would skip the
# step and continue with incomplete data.
if [ ! -d documents ]; then
  mkdir documents
  perl "$scripts/opusXML2docs.pl" \
    --ids OpenSubtitles.de-en.ids --l1 en --l2 de --outdir documents \
    --source OpenSubtitles.de-en.en --target OpenSubtitles.de-en.de || {
    # Keep perl's exit status, drop the folder created just above, and abort.
    status=$?
    rm -rf -- documents
    echo "| - Extraction failed; removed the incomplete folder 'documents'." >&2
    exit "$status"
  }
else
  echo "| - Documents seem to be extracted already. To repeat extraction, remove the folder 'documents'."
fi
echo "| Organize into folders by year and clean up..."
# Move every entry documents/<prefix>_<rest> into the folder documents/<prefix>
# (the prefix is everything before the first '_'). The presence of
# documents/1916 is used as the marker that this step already ran.
if [ ! -d documents/1916 ]; then
  for file in documents/*; do
    # An unmatched glob stays as the literal pattern; skipping it avoids
    # creating a directory that is literally named 'documents/*'.
    [ -e "$file" ] || continue
    entry=${file##*/}
    # Only entries of the form <prefix>_<rest> can be filed. Anything else
    # (e.g. prefix folders left by an interrupted earlier run) would
    # otherwise be moved into itself and abort the script.
    case "$entry" in
      [!_]*_*) ;;
      *) continue ;;
    esac
    year_dir="documents/${entry%%_*}"
    mkdir -p -- "$year_dir"
    mv -- "$file" "$year_dir"
  done
else
  echo "| - Documents seem to be organized by year already. To repeat, remove all subfolders documents/*"
fi
# Drop the stray document pair filed under documents/1191.
rm -rf -- documents/1191

# Delete the downloaded archive and the unzipped/intermediate files.
# Comment these lines out if you would like to keep them.
corpus=OpenSubtitles.de-en
rm -f -- source.de-en.zip
rm -f -- "$corpus.de" "$corpus.en" "$corpus.ids"
rm -f -- doc.order.en-de.txt
rm -f -- README