Bert

# download the bzip2-compressed XML corpus and the script that converts it to IOB format
$ wget http://spraakbanken.gu.se/lb/resurser/meningsmangder/suc3.xml.bz2
$ wget https://raw.githubusercontent.com/EmilStenstrom/suc_to_iob/master/suc_to_iob.py

# 'resolve' ambiguities - cowardly just set them to 'O':
# any tag (column 2) that contains a "/" is replaced by "O".
# Only the first two columns are kept, separated by a single space; an empty
# input line is therefore written as a lone space (awk still sees NF == 0 for it).
$ python suc_to_iob.py suc3.xml.bz2 | awk '{if (match($2,/\//)) $2 = "O"; print $1,$2}' > suc_3.0_iob.txt

# stats: total number of lines, then number of lines per tag (column 2)
$ wc -l suc_3.0_iob.txt
1240838 suc_3.0_iob.txt

$ awk '{print $2}' suc_3.0_iob.txt | sort | uniq -c | sort -nrk1,1
1094014 O
  74245
  15680 B-PRS
  14044 I-TME
  11978 B-TME
   9610 B-LOC
   7014 I-PRS
   3640 I-MSR
   3066 B-ORG
   2433 B-MSR
   1388 I-ORG
   1016 I-LOC
    911 I-WRK
    753 B-WRK
    370 B-EVN
    305 B-OBJ
    216 I-EVN
    155 I-OBJ

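# calculate after how many blank lines to split
# (74245 = number of blank lines, i.e. the lines with an empty tag column in the stats above)
# 74245 *.7 = 51971.5
# 74245 *.9 = 66820.5
# So 70% ~ 51971 and 90% ~ 66820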

# split into train, valid and test (70/20/10)
# `eco` counts the blank lines seen so far: it starts at 0 and is incremented
# only after a blank line has been written, so each blank line ends up in the
# same file as the lines before it. Output switches to valid.txt once eco
# exceeds 51971 and to test.txt once it exceeds 66820.
$ awk 'BEGIN {out = "train.txt"} {if (eco > 51971) out = "valid.txt"; if (eco > 66820) out = "test.txt"; print $1,$2 > out; if (NF == 0) eco=eco+1}' suc_3.0_iob.txt
# convert train.txt to CSV: a first field containing a comma is wrapped in
# double quotes, and a first field that is exactly `"` becomes `""`. Lines with
# two fields are written as "field1,field2"; any other line goes through a plain `print`.
# NOTE(review): strict CSV readers parse `""` as an empty field (a literal quote
# would be `""""`) - confirm this is what the consumer expects.
# NOTE(review): only train.txt is converted here; valid.txt and test.txt are not.
$ awk '{if (match($1,/,/)) {$1="\""$1"\""} ; if ($1 == "\"") {$1="\"\""} ; if (NF == 2) {print $1","$2} else print}' train.txt > train.csv