How to distinguish human languages by letter frequency histogram?

How can you identify the language of a text sample without using dictionaries? One way is to compare the letter frequencies in the sample with those of known languages. For example, in Polish the letter 'a' accounts for about 0.0551146 of all letters, in French for 0.049458, and in German for 0.0434701.

I wrote a small C++ program that takes the path to a file in an unknown language as its argument and prints how much that language differs from each of the languages it knows. The lowest result is the best match. The letter frequencies of the known languages are computed from the files in the samples directory.

Below is the program's output for a test sample, followed by the sample itself (in French):

bash-3.2$ ./a.out test3.txt 
difference between Polish language: 0.0965482
difference between French language: 0.0442431
difference between German language: 0.0945827
Au contraire de Józef Piłsudski, qui rêvait d'une grande fédération slave alliée à l'Allemagne contre la Russie, Roman Dmowski était désireux de fonder une Pologne alliée à la Russie : il a ainsi soutenu l'idée d'une association avec l'Empire russe, puis, malgré son anticommunisme, avec l'Union soviétique, qu'il considère malgré tout comme un prolongement de la politique des tsars. De même, bien qu'agnostique, Dmowski ne concevait l'existence d'un État polonais que sur une base catholique, et ethniquement homogène, ce qui impliquait pour les populations allogènes (Baltes, Ukrainiens, Allemands et mêmes Juifs) l'expulsion ou l'assimilation, rejetant ainsi le concept de République des Deux Nations, avancé par la gauche et les libéraux.

Below is the source code of the program.

#include <math.h>

#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

using namespace std;

// Letter-frequency profile of a language, built from one or more text files.
//
// Only the 26 unaccented letters 'a'..'z' are counted, case-insensitively.
// Every other byte (digits, punctuation, whitespace, the bytes of multi-byte
// encoded accented letters) is ignored and does not influence the profile.
class Language {
public:
    // Builds the profile from the concatenated contents of `filenames`.
    // A file that cannot be opened contributes no text and is skipped.
    Language (const std::vector<std::string>& filenames) {
        for (std::vector<std::string>::const_iterator name = filenames.begin(); name != filenames.end(); ++name) {
            std::ifstream in(name->c_str());
            if (!in) {
                continue;
            }
            // read whole file into string
            std::stringstream buffer;
            buffer << in.rdbuf();
            std::string data = buffer.str();
            // make content of file lower case
            std::transform(data.begin(), data.end(), data.begin(), toLowerChar);
            sample += data;
        }
        computeLettersDistribution();
    }

    // Returns a copy of the profile: one entry per letter 'a'..'z', each value
    // being that letter's share of all counted letters. The values sum to 1,
    // or are all 0 when the samples contained no letters.
    std::map<char, double> getLettersDistribution() const {
        return lettersDistribution;
    }

    // Returns the Euclidean distance between this profile and the profile of
    // `a`. 0 means identical distributions; smaller means more similar.
    double compare(const Language& a) const {
        double squaredError = 0.0;
        for (std::map<char, double>::const_iterator i = lettersDistribution.begin(); i != lettersDistribution.end(); ++i) {
            // A letter missing from the other profile counts as frequency 0.
            const std::map<char, double>::const_iterator other = a.lettersDistribution.find(i->first);
            const double otherFrequency = (other != a.lettersDistribution.end()) ? other->second : 0.0;
            const double diff = i->second - otherFrequency;
            squaredError += diff * diff;
        }
        return std::sqrt(squaredError);
    }

protected:
    // Lower-cases one byte. std::tolower requires a value representable as
    // unsigned char, so the (possibly negative) char is converted first;
    // passing a negative char directly is undefined behaviour.
    static char toLowerChar(char c) {
        return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    }

    // Recomputes lettersDistribution from `sample`.
    void computeLettersDistribution() {
        for (char c = 'a'; c <= 'z'; ++c) {
            lettersDistribution[c] = 0.0;
        }

        std::string::size_type letterCount = 0;
        for (std::string::const_iterator c = sample.begin(); c != sample.end(); ++c) { // over letters in sample
            if (*c >= 'a' && *c <= 'z') {
                lettersDistribution[*c] += 1.0;
                ++letterCount;
            }
        }

        // No letters at all: keep the all-zero profile instead of computing 0/0.
        if (letterCount == 0) {
            return;
        }

        // Normalise by the number of counted letters, not by the sample length,
        // so whitespace, punctuation and ignored bytes do not skew the profile.
        for (std::map<char, double>::iterator d = lettersDistribution.begin(); d != lettersDistribution.end(); ++d) {
            d->second /= static_cast<double>(letterCount);
        }
    }

    std::string sample; // lower-cased text of all sample files, concatenated
    std::map<char, double> lettersDistribution; // key is a letter, value is its relative frequency among counted letters
};

// Entry point. Builds reference profiles for Polish, French and German from
// the files under samples/, builds a profile for the file named by the single
// command-line argument, and prints one distance per reference language
// (the lowest value is the best match).
//
// Returns EXIT_FAILURE when the argument count is wrong or the file to
// investigate cannot be opened, EXIT_SUCCESS otherwise.
int main(int argc, char* argv[]) {

    // argv[0] may be null when argc is 0; never stream a null pointer.
    const char* program = (argc > 0 && argv[0] != 0) ? argv[0] : "program";

    if (2 != argc) {
        std::cerr << "usage: " << program << " filename" << '\n';
        return EXIT_FAILURE;
    }

    // Fail early: an unreadable input would otherwise produce meaningless distances.
    {
        std::ifstream probe(argv[1]);
        if (!probe) {
            std::cerr << program << ": cannot open " << argv[1] << '\n';
            return EXIT_FAILURE;
        }
    }

    std::vector<std::string> samplesPL; // vector of filenames
    samplesPL.push_back("samples/polski_1.txt");
    samplesPL.push_back("samples/polski_2.txt");
    Language polish(samplesPL);

    std::vector<std::string> samplesFR; // vector of filenames
    samplesFR.push_back("samples/francais_1.txt");
    samplesFR.push_back("samples/francais_2.txt");
    samplesFR.push_back("samples/francais_3.txt");
    Language francais(samplesFR);

    std::vector<std::string> samplesDE; // vector of filenames
    samplesDE.push_back("samples/deutsch_1.txt");
    samplesDE.push_back("samples/deutsch_2.txt");
    Language deutsch(samplesDE);

    std::vector<std::string> investigatedSamples; // vector of filenames
    investigatedSamples.push_back(argv[1]);
    Language investigated(investigatedSamples);

    std::cout << "difference between Polish language: " << investigated.compare(polish) << '\n';
    std::cout << "difference between French language: " << investigated.compare(francais) << '\n';
    std::cout << "difference between German language: " << investigated.compare(deutsch) << '\n';

    return EXIT_SUCCESS;
}

0 commentaires:

Post a Comment