20200406

bp1.txt - freqs from the first video from MH
coca_freq.txt - freqs from the COCA corpus <https://www.wordfrequency.info/free.asp?s=y>

cat bp1.txt | sed 's/ /\n/g' | sed 's/\(.*\)/\L\1/' | sort | uniq -c | awk '{print $2,$1}' | sort -k 1,1 > bp_wf.txt

awk '{print $2,$4}' coca_freq.txt | sort -k 1,1 > coca_wf.txt

cat * | sed 's/ /\n/g' | sed 's/^\s*//g' | sed 's/\/.*$//g' | sed 's/\(.*\)/\L\1/' | sort | uniq -c | awk '{print $2,$1}' | sort -k 1,1 > ../brown_wf.txt

Corpus sizes (totals used in the log-likelihood computation below): coca=450000000, brown=1196221, bp=26282

(148)% ./merge -k -e "NA" brown_wf.txt bp_wf.txt | grep -v NA | sort -nrk2,2 | awk -v c=1196221 -v d=26282 '{

(148)% ./merge -k -e "NA" coca_wf.txt bp_wf.txt | grep -v NA | sort -nrk2,2 | awk -v c=450000000 -v d=26282 '{

    a=$2
    b=$3
    E1 = c*(a+b) / (c+d)
    E2 = d*(a+b) / (c+d)

    p1 = a*log (a/E1)
    p2 = b*log (b/E2)
    if (a == 0) {p1=0}
    if (b == 0) {p2=0}

    G2 = 2*(p1 + p2)

    if (G2 >= 3.84) {
        if (p1 > p2) {
            print $1, -G2
        } else {
            print $1, G2
        }
    }

}' | sort -nk2,2 | awk '{print $2,$1}' > [brown,coca]_bp.txt