Bert /
Bert
# download
$ wget http://spraakbanken.gu.se/lb/resurser/meningsmangder/suc3.xml.bz2
$ wget https://raw.githubusercontent.com/EmilStenstrom/suc_to_iob/master/suc_to_iob.py
# 'resolve' ambiguities - cowardly just set them to 'O'
$ python suc_to_iob.py suc3.xml.bz2 | awk '{if (match($2,/\//)) $2 = "O"; print $1,$2}' > suc_3.0_iob.txt
# stats
$ wc -l suc_3.0_iob.txt
1240838 suc_3.0_iob.txt
$ awk '{print $2}' suc_3.0_iob.txt | sort | uniq -c | sort -nrk1,1
1094014 O
74245
15680 B-PRS
14044 I-TME
11978 B-TME
9610 B-LOC
7014 I-PRS
3640 I-MSR
3066 B-ORG
2433 B-MSR
1388 I-ORG
1016 I-LOC
911 I-WRK
753 B-WRK
370 B-EVN
305 B-OBJ
216 I-EVN
155 I-OBJ
# calculate after how many blank-line-delimited sentences to split (74245 blank lines in total, see stats above)
# 74245 *.7 = 51971.5
# 74245 *.9 = 66820.5
# So 70% ~ 51971 and 90% ~ 66820
# split into train, valid and test (70/20/10)
$ awk 'BEGIN {out = "train.txt"} {if (eco > 51971) out = "valid.txt"; if (eco > 66820) out = "test.txt"; print $1,$2 > out; if (NF == 0) eco=eco+1}' suc_3.0_iob.txt
$ wget http://spraakbanken.gu.se/lb/resurser/meningsmangder/suc3.xml.bz2
$ wget https://raw.githubusercontent.com/EmilStenstrom/suc_to_iob/master/suc_to_iob.py
# 'resolve' ambiguities - cowardly just set them to 'O'
$ python suc_to_iob.py suc3.xml.bz2 | awk '{if (match($2,/\//)) $2 = "O"; print $1,$2}' > suc_3.0_iob.txt
# stats
$ wc -l suc_3.0_iob.txt
1240838 suc_3.0_iob.txt
$ awk '{print $2}' suc_3.0_iob.txt | sort | uniq -c | sort -nrk1,1
1094014 O
74245
15680 B-PRS
14044 I-TME
11978 B-TME
9610 B-LOC
7014 I-PRS
3640 I-MSR
3066 B-ORG
2433 B-MSR
1388 I-ORG
1016 I-LOC
911 I-WRK
753 B-WRK
370 B-EVN
305 B-OBJ
216 I-EVN
155 I-OBJ
# calculate after how many blank-line-delimited sentences to split (74245 blank lines in total, see stats above)
# 74245 *.7 = 51971.5
# 74245 *.9 = 66820.5
# So 70% ~ 51971 and 90% ~ 66820
# split into train, valid and test (70/20/10)
$ awk 'BEGIN {out = "train.txt"} {if (eco > 51971) out = "valid.txt"; if (eco > 66820) out = "test.txt"; print $1,$2 > out; if (NF == 0) eco=eco+1}' suc_3.0_iob.txt
$ awk '{if (match($1,/,/)) {$1="\""$1"\""} ; if ($1 == "\"") {$1="\"\""} ; if (NF == 2) {print $1","$2} else print}' train.txt > train.csv