Распознавание речи на STM32F4-Discovery

Автор работы: Пользователь скрыл имя, 11 Октября 2013 в 17:10, курсовая работа

Краткое описание

Также стоит отметить статью «Мел-кепстральные коэффициенты (MFCC) и распознавание речи» и выполненную на её основе работу по идентификации человека по голосу: «Кто там? — Идентификация человека по голосу».
В данной работе предлагается простой алгоритм (и его реализация на C++) системы распознавания речи по короткому словарю, основанный на анализе статистического распределения мел-кепстральных коэффициентов (Mel-frequency cepstrum coefficients, MFCC).

Прикрепленные файлы: 1 файл

Пояснювальна записка .doc

— 447.00 Кб (Скачать документ)

fileMFCC.read((char*)&freq, sizeof(freq));

 

if ((fileSize - 10) % (sizeof(float) * dim) != 0)

ThrowFormatException("can't read file %s", (path + m_files[i][j]).c_str());

 

if (m_mfcc.GetDim() == 0)

m_mfcc.SetDim((int)dim);

else if (m_mfcc.GetDim() != (int)dim)

ThrowFormatException("file %s has wrong dim: %d", (path + m_files[i][j]).c_str(), (int)dim);

 

int vectorsNum = (int)((fileSize - 10) / (sizeof(float) * dim));

if (vectorsNum == 0)

ThrowFormatException("file %s is empty");

 

m_mfcc.SetVectorsNum(filesCount, vectorsNum);

float* pVectors;

m_mfcc.GetVectors(filesCount, vectorsNum, pVectors);

fileMFCC.read((char*)pVectors, sizeof(float) * vectorsNum * m_mfcc.GetDim());

filesCount++;

 

if (fileMFCC.fail())

ThrowFormatException("can't read file %s", (path + m_files[i][j]).c_str());

fileMFCC.close();

}

}

m_baseOK = true;

}

 

//! Builds the means of the loaded MFCC base with WordRecognizer::KMeans.
//!
//! \param _maxItersNum   forwarded to WordRecognizer::KMeans
//! \param _meansNum      number of means to create; must be positive
//! \param _minVectorsNum forwarded to WordRecognizer::KMeans
//! \return the status reported by WordRecognizer::KMeans
//! \throws exception when the base is not loaded, _meansNum <= 0 or the
//!         MFCC dimension is not positive
ResultStatus CreateMeans(const int &_maxItersNum, const int &_meansNum, const int &_minVectorsNum)
{
    if (!m_baseOK) throw exception("load base first");
    if (_meansNum <= 0) throw exception("means num <= 0");

    // With a non-positive dimension the means buffer would be empty and
    // taking &m_means[0] below would be undefined behaviour, so reject it
    // before any member is modified.
    const int dim = m_mfcc.GetDim();
    if (dim <= 0) throw exception("mfcc dim <= 0");

    meansNum = _meansNum;
    m_dim = dim;
    m_means.resize(meansNum * m_dim);

    ResultStatus result = WordRecognizer::KMeans(m_mfcc, _maxItersNum, meansNum, _minVectorsNum, &m_means[0]);

    // The flag is raised regardless of the returned status, as before.
    m_meansOK = true;
    return result;
}

 

void SetAdaptWeight(const float &_weight)

{

m_adaptWeight = _weight;

}

 

void MakeAdaptation()

{

if (!m_baseOK) throw exception("load base first");

if (!m_meansOK || (m_means.size() == 0)) throw exception("means are not created");

 

int superVectorDim = meansNum * m_mfcc.GetDim();

if ((int)m_means.size() != superVectorDim) throw exception("wrong means size");

 

// Группируем адаптированные средние  значения по принадлежности одному  слову

m_adaptMeans.SetDim(superVectorDim);

m_adaptMeans.SetSize((int)m_files.size());

int count = 0;

for (int i = 0; i < m_adaptMeans.GetSize(); i++)

{

m_adaptMeans.SetVectorsNum(i, (int)m_files[i].size());

count += (int)m_files[i].size();

}

if (count != m_mfcc.GetSize()) throw exception("m_mfcc.GetSize() != sum of m_files[i].size()");

 

// Проводим адаптацию для всех  файлов

count = 0;

int adaptVectorsNum = 0;

float* adaptResult = NULL;

int mfccVectorsNum = 0;

float* mfccData = NULL;

for (int i = 0; i < m_adaptMeans.GetSize(); i++)

{

m_adaptMeans.GetVectors(i, adaptVectorsNum, adaptResult);

for (int j = 0; j < adaptVectorsNum; j++, adaptResult += m_adaptMeans.GetDim(), count++)

{

m_mfcc.GetVectors(count, mfccVectorsNum, mfccData);

WordRecognizer::Adaptation(m_mfcc.GetDim(), mfccVectorsNum, meansNum, &m_means[0], m_adaptWeight, mfccData, adaptResult);

}

}

m_adaptMeansOK = true;

}

 

//! Builds the LDA matrix and the mean vector from the adapted means.
//!
//! \param _LDASize number of rows of the LDA matrix; must be positive
//! \return the status reported by WordRecognizer::CreateLDAMatrix
//! \throws exception when adaptation has not been made, _LDASize <= 0 or
//!         the adapted means have a non-positive dimension
ResultStatus CreateLDA(const int &_LDASize)
{
    if (!m_adaptMeansOK) throw exception("make adaptation first");

    // A non-positive size would leave m_LDAMatrix empty (or request an
    // absurd allocation), making &m_LDAMatrix[0] below invalid.
    if (_LDASize <= 0) throw exception("LDA size <= 0");

    const int dim = m_adaptMeans.GetDim();
    if (dim <= 0) throw exception("adapt means size <= 0");

    // Members are only touched once every argument has been validated.
    m_LDASize = _LDASize;
    m_LDAMatrix.resize(m_LDASize * dim);
    m_mean.resize(dim);

    ResultStatus result = WordRecognizer::CreateLDAMatrix(m_adaptMeans, m_LDASize, &m_LDAMatrix[0], &m_mean[0]);

    // The flag is raised regardless of the returned status, as before.
    m_LDAOK = true;
    return result;
}

 

void MakeProjections()

{

if (!m_adaptMeansOK) throw exception("make adaptation first");

if (!m_LDAOK) throw exception("create LDA first");

 

m_projections.SetDim(m_LDASize);

m_projections.SetSize(m_adaptMeans.GetSize());

int adaptDim = m_adaptMeans.GetDim();

 

for (int i = 0; i < m_adaptMeans.GetSize(); i++)

{

int vectorsNum = 0;

float* pAdaptVectors = NULL;

float* pProjectionVectors = NULL;

m_adaptMeans.GetVectors(i, vectorsNum, pAdaptVectors);

m_projections.SetVectorsNum(i, vectorsNum);

m_projections.GetVectors(i, vectorsNum, pProjectionVectors);

 

for (int j = 0; j < vectorsNum; j++, pAdaptVectors += adaptDim, pProjectionVectors += m_LDASize)

WordRecognizer::MakeProjection(m_LDASize, m_adaptMeans.GetDim(), &m_LDAMatrix[0], &m_mean[0],  pAdaptVectors, pProjectionVectors);

}

m_projectionsOK = true;

}

 

//! Fills m_wordsMean and m_wordsSigma from the projections via
//! WordRecognizer::CreateCompareSystem and remembers the base words as the
//! learned words.
//!
//! \param _minSigma forwarded to WordRecognizer::CreateCompareSystem
//! \return the status reported by WordRecognizer::CreateCompareSystem
//! \throws exception when the base or the projections are missing, or when
//!         the number of base words differs from the number of clusters
ResultStatus CreateCompareSystem(const float &_minSigma)
{
    if (!m_baseOK)
        throw exception("load base first");
    if (!m_projectionsOK)
        throw exception("make projections first");

    const int projectionDim = m_projections.GetDim();
    const int clustersNum = m_projections.GetSize();
    if ((int)m_wordsBase.size() != clustersNum)
        throw exception("word num in base != projection clusters");

    m_learnWords = m_wordsBase;

    // One mean vector of projectionDim values and one sigma per word.
    m_wordsMean.resize(clustersNum * projectionDim);
    m_wordsSigma.resize(clustersNum);

    ResultStatus status = WordRecognizer::CreateCompareSystem(m_projections, &m_wordsMean[0], &m_wordsSigma[0], _minSigma);
    m_compareSystemOK = true;
    return status;
}

 

void SaveSystem(const char* _fileName)

{

ofstream stateFile(_fileName, ios_base::out | ios_base::binary | ios_base::trunc);

if (!stateFile.is_open())

ThrowFormatException("can't open file %s", _fileName);

 

// means

char boolChar = (m_meansOK) ? 0x01 : 0x00;

stateFile.write(&boolChar, sizeof(boolChar));

stateFile.write((char*)&m_dim, sizeof(m_dim));

stateFile.write((char*)&meansNum, sizeof(meansNum));

if (meansNum * m_dim > 0)

stateFile.write((char*)&m_means[0], sizeof(m_means[0]) * meansNum * m_dim);

 

// adaptation

stateFile.write((char*)&m_adaptWeight, sizeof(m_adaptWeight));

 

// LDA

boolChar = (m_LDAOK) ? 0x01 : 0x00;

stateFile.write(&boolChar, sizeof(boolChar));

stateFile.write((char*)&m_LDASize, sizeof(m_LDASize));

if (m_LDASize * m_dim * meansNum > 0)

{

stateFile.write((char*)&m_LDAMatrix[0], sizeof(m_LDAMatrix[0]) * m_LDASize * m_dim * meansNum);

stateFile.write((char*)&m_mean[0], sizeof(m_mean[0]) * m_dim * meansNum);

}

 

// compare system

boolChar = (m_compareSystemOK) ? 0x01 : 0x00;

stateFile.write(&boolChar, sizeof(boolChar));

int wordsNum = m_learnWords.size();

stateFile.write((char*)&wordsNum, sizeof(wordsNum));

if (wordsNum > 0)

{

for (int i = 0; i < wordsNum; i++)

{

int len = m_learnWords[i].length();

stateFile.write((char*)&len, sizeof(len));

stateFile.write(m_learnWords[i].c_str(), len);

}

stateFile.write((char*)&m_wordsMean[0], sizeof(m_wordsMean[0]) * wordsNum * m_LDASize);

stateFile.write((char*)&m_wordsSigma[0], sizeof(m_wordsSigma[0]) * wordsNum);

}

 

if (stateFile.fail())

ThrowFormatException("fail write file %s", _fileName);

}

 

void LoadSystem(const char* _fileName)

{

ifstream stateFile(_fileName, ios_base::in | ios_base::binary);

if (!stateFile.is_open())

ThrowFormatException("can't open file %s", _fileName);

 

// means

char boolChar;

stateFile.read(&boolChar, sizeof(boolChar));

m_meansOK = (boolChar != 0);

stateFile.read((char*)&m_dim, sizeof(m_dim));

stateFile.read((char*)&meansNum, sizeof(meansNum));

m_means.resize(meansNum * m_dim);

if (meansNum * m_dim > 0)

stateFile.read((char*)&m_means[0], sizeof(m_means[0]) * meansNum * m_dim);

 

// adaptation

stateFile.read((char*)&m_adaptWeight, sizeof(m_adaptWeight));

 

// LDA

stateFile.read(&boolChar, sizeof(boolChar));

m_LDAOK = (boolChar != 0);

stateFile.read((char*)&m_LDASize, sizeof(m_LDASize));

m_LDAMatrix.resize(m_LDASize * m_dim * meansNum);

m_mean.resize(m_dim * meansNum);

if (m_LDASize * m_dim * meansNum > 0)

{

stateFile.read((char*)&m_LDAMatrix[0], sizeof(m_LDAMatrix[0]) * m_LDASize * m_dim * meansNum);

stateFile.read((char*)&m_mean[0], sizeof(m_mean[0]) * m_dim * meansNum);

}

 

// compare system

stateFile.read(&boolChar, sizeof(boolChar));

m_compareSystemOK = (boolChar != 0);

int wordsNum = 0;

stateFile.read((char*)&wordsNum, sizeof(wordsNum));

m_learnWords.resize(wordsNum);

m_wordsMean.resize(wordsNum * m_LDASize);

m_wordsSigma.resize(wordsNum);

if (wordsNum > 0)

{

vector<char> tmpStr;

for (int i = 0; i < wordsNum; i++)

{

int len = 0;

stateFile.read((char*)&len, sizeof(len));

tmpStr.resize(len + 1);

stateFile.read(&tmpStr[0], len);

tmpStr[len] = 0x00;

m_learnWords[i] = &tmpStr[0];

}

stateFile.read((char*)&m_wordsMean[0], sizeof(m_wordsMean[0]) * wordsNum * m_LDASize);

stateFile.read((char*)&m_wordsSigma[0], sizeof(m_wordsSigma[0]) * wordsNum);

}

 

if (stateFile.fail())

ThrowFormatException("fail read file %s", _fileName);

}

 

//! Classifies every projected vector of the loaded base and writes a report.
//!
//! The report starts with a header row listing the learned words. For each
//! base word it then holds one row with the number of that word's vectors
//! assigned to each learned word, followed by the number of attempts, the
//! number of correct answers and the word error rate.
//!
//! \param _resultsFileName text file that receives the report
//! \param _useSigma        forwarded to WordRecognizer::GetProbability
//! \param _wer             [out] share of wrongly recognised vectors in
//!                         percent; 0 when there was nothing to classify
//! \throws exception when a required stage is missing, there are no learned
//!         words, or the base words do not match the projection clusters
void Test(const char* _resultsFileName, bool _useSigma, double &_wer)
{
    if (!m_baseOK) throw exception("load base first");
    if (!m_projectionsOK) throw exception("make projections first");
    if (!m_compareSystemOK) throw exception("create compare system first");

    // &probability[0] and answers[maxPindx] below need at least one word.
    const int learnedWordsNum = (int)m_learnWords.size();
    if (learnedWordsNum == 0) throw exception("no learned words");

    // m_wordsBase is indexed by cluster below; CreateCompareSystem performs
    // the same check, but it is not run when the system is loaded from file.
    if (m_projections.GetSize() != (int)m_wordsBase.size())
        throw exception("word num in base != projection clusters");

    ofstream resultsFile(_resultsFileName);
    if (!resultsFile.is_open())
        ThrowFormatException("can't open file %s", _resultsFileName);

    resultsFile << "words: ";
    for (int i = 0; i < learnedWordsNum; i++)
        resultsFile << "\t" << m_learnWords[i];
    resultsFile << endl;

    int correctNum = 0;
    int attemptsNum = 0;
    vector<int> answers(learnedWordsNum);
    vector<float> probability(learnedWordsNum);
    const int dim = m_projections.GetDim();

    for (int i = 0; i < m_projections.GetSize(); i++)
    {
        const string &testWordName = m_wordsBase[i];
        float* pTestVector = NULL;
        int vectorsNum = 0;
        m_projections.GetVectors(i, vectorsNum, pTestVector);

        answers.assign(learnedWordsNum, 0);

        for (int j = 0; j < vectorsNum; j++, pTestVector += dim)
        {
            attemptsNum++;
            WordRecognizer::GetProbability(dim, learnedWordsNum, &m_wordsMean[0], &m_wordsSigma[0], pTestVector, _useSigma, &probability[0]);

            // First word with the strictly greatest score wins; scores that
            // do not exceed 0 leave the answer at word 0.
            // NOTE(review): assumes GetProbability yields positive scores - confirm.
            float maxP = 0.;
            int maxPindx = 0;
            for (int k = 0; k < learnedWordsNum; k++)
            {
                if (probability[k] > maxP)
                {
                    maxP = probability[k];
                    maxPindx = k;
                }
            }

            answers[maxPindx]++;
            if (m_learnWords[maxPindx] == testWordName)
                correctNum++;
        }

        resultsFile << testWordName << ": ";
        for (int k = 0; k < learnedWordsNum; k++)
            resultsFile << "\t" << answers[k];
        resultsFile << endl;
    }

    resultsFile << endl;
    resultsFile << "attempts num: " << attemptsNum << endl;
    resultsFile << "correct num: " << correctNum << endl;

    // Without attempts the ratio would be 0/0 (NaN); report 0 instead.
    _wer = (attemptsNum > 0) ? 100. * (double)(attemptsNum - correctNum) / attemptsNum : 0.;
    resultsFile << "word error rate: " << _wer << " %" << endl;
}

 

};

 

//! Command-line parameters of the program together with their defaults.
struct Params
{
    string baseFile;            //!< File describing the base (testing or training)
    string systemFile;          //!< File storing the result of training the system
    string testResultsFile;     //!< File with the testing results
    bool learn;                 //!< Train the system?
    int meansNum;               //!< Number of means
    int kmeansMaxIters;         //!< Maximum number of K-means iterations
    int minVectorsNumForKMeans; //!< Smallest number of vectors corresponding to one mean
    float adaptWeight;          //!< "Sensitivity" of the adaptation
    int LDASize;                //!< Dimension of the LDA matrix
    float sigmaFloor;           //!< Minimum value of the standard deviation
    bool useSigma;              //!< Use normalization by the within-class standard deviation when testing

    //! Sets every numeric and boolean parameter to its default; the file names start empty.
    Params()
        : learn(false)
        , meansNum(10)
        , kmeansMaxIters(2000)
        , minVectorsNumForKMeans(10)
        , adaptWeight(20.f)
        , LDASize(20)
        , sigmaFloor(0.001f)
        , useSigma(false)
    {
    }
};

 

//! Prints the list of command-line parameters to cout.
//! \param _params supplies the default values shown next to each parameter
void PrintHelp(Params &_params)
{
    // Description of the base file format, one output line per entry.
    static const char* const baseFormat[] =
    {
        "                 [WORDS NUM]",
        "                 [WORD 1 NAME]",
        "                 [FILES NUM OF WORD 1]",
        "                 [FILE 1 OF WORD 1]",
        "                 [FILE 2 OF WORD 1]",
        "                 ............",
        "                 [WORD 2 NAME]",
        "                 [FILES NUM OF WORD 2]",
        "                 [FILE 1 OF WORD 2]",
        "                 [FILE 2 OF WORD 2]",
        "                 ............"
    };
    const int baseFormatLines = sizeof(baseFormat) / sizeof(baseFormat[0]);

    const char* const learnDefault = _params.learn ? "enabled" : "disabled";
    const char* const sigmaDefault = _params.useSigma ? "enabled" : "disabled";

    cout << "--help           Show full list of parameters." << endl << endl;

    cout << "--base           File with base description (required). File format:" << endl;
    for (int line = 0; line < baseFormatLines; line++)
        cout << baseFormat[line] << endl;
    cout << endl;

    cout << "--system         File containing leaned system results (required)." << endl << endl;
    cout << "--test_results   File with testing results (required)." << endl << endl;
    cout << "--learn          Learn system (" << learnDefault << " by default)." << endl << endl;
    cout << "--means          Means number (" << _params.meansNum << " by default)." << endl << endl;
    cout << "--max_iters      Maximum iterations number for K-means algorithm (" << _params.kmeansMaxIters << " by default)." << endl << endl;
    cout << "--min_vectors    Minimum vectors number corresponds to one mean (" << _params.minVectorsNumForKMeans << " by default)." << endl << endl;
    cout << "--adapt_weight   Adaptation weight (" << _params.adaptWeight << " by default)." << endl << endl;
    cout << "--lda            LDA-matrix size (" << _params.LDASize << " by default)." << endl << endl;
    cout << "--use_sigma      Using within-class deviation normalization in testing (" << sigmaDefault << " by default)." << endl << endl;
    cout << "--sigma_floor    Standart deviation minimum value (" << _params.sigmaFloor << " by default)." << endl << endl;
}

 

//! Returns the value that follows the option argv[_arg] and moves _arg onto it.
//! Prints a diagnostic and returns NULL when the option is the last argument.
static const char* TakeOptionValue(int argc, char* argv[], int &_arg)
{
    if (argc <= _arg + 1)
    {
        cout << "Can't read value for parameter '" << argv[_arg] << "'" << endl;
        return NULL;
    }
    _arg++;
    return argv[_arg];
}

//! Returns true when a required parameter has a value, otherwise prints a diagnostic.
static bool HasRequiredParam(const string &_value, const char* _name)
{
    if (_value.length() != 0)
        return true;
    cout << "Parameter '" << _name << "' not founded. Use '--help' for the full list of parameters." << endl;
    return false;
}

//! Parses the command line into _params.
//!
//! Parameters not mentioned on the command line keep the value already
//! stored in _params. Numeric values are converted with atoi / atof.
//!
//! \param argc    number of entries in argv
//! \param argv    command-line arguments; argv[0] is skipped
//! \param _params [in,out] receives the parsed values
//! \return false when help was requested, an option is unknown or lacks its
//!         value, or one of the required parameters (--base, --system,
//!         --test_results) is missing; true otherwise
bool ReadCommandLine(int argc, char* argv[],
                     Params &_params)
{
    for (int arg = 1; arg < argc; arg++)
    {
        const char* option = argv[arg];

        if (strcmp(option, "--help") == 0)
        {
            PrintHelp(_params);
            return false;
        }
        else if (strcmp(option, "--learn") == 0)
        {
            _params.learn = true;
        }
        else if (strcmp(option, "--use_sigma") == 0)
        {
            _params.useSigma = true;
        }
        else if (strcmp(option, "--base") == 0)
        {
            const char* value = TakeOptionValue(argc, argv, arg);
            if (value == NULL) return false;
            _params.baseFile = value;
        }
        else if (strcmp(option, "--system") == 0)
        {
            const char* value = TakeOptionValue(argc, argv, arg);
            if (value == NULL) return false;
            _params.systemFile = value;
        }
        else if (strcmp(option, "--test_results") == 0)
        {
            const char* value = TakeOptionValue(argc, argv, arg);
            if (value == NULL) return false;
            _params.testResultsFile = value;
        }
        else if (strcmp(option, "--means") == 0)
        {
            const char* value = TakeOptionValue(argc, argv, arg);
            if (value == NULL) return false;
            _params.meansNum = atoi(value);
        }
        else if (strcmp(option, "--max_iters") == 0)
        {
            const char* value = TakeOptionValue(argc, argv, arg);
            if (value == NULL) return false;
            _params.kmeansMaxIters = atoi(value);
        }
        else if (strcmp(option, "--min_vectors") == 0)
        {
            const char* value = TakeOptionValue(argc, argv, arg);
            if (value == NULL) return false;
            _params.minVectorsNumForKMeans = atoi(value);
        }
        else if (strcmp(option, "--adapt_weight") == 0)
        {
            const char* value = TakeOptionValue(argc, argv, arg);
            if (value == NULL) return false;
            _params.adaptWeight = (float)atof(value);
        }
        else if (strcmp(option, "--lda") == 0)
        {
            const char* value = TakeOptionValue(argc, argv, arg);
            if (value == NULL) return false;
            _params.LDASize = atoi(value);
        }
        else if (strcmp(option, "--sigma_floor") == 0)
        {
            // The diagnostic now names the real option ('--sigma_floor').
            const char* value = TakeOptionValue(argc, argv, arg);
            if (value == NULL) return false;
            _params.sigmaFloor = (float)atof(value);
        }
        else
        {
            cout << "Can't parse parameter " << option << endl;
            return false;
        }
    }

    // All three files are documented as required by the help text. The third
    // check used to repeat '--base', so a missing '--test_results' went
    // unnoticed until the results file failed to open.
    return HasRequiredParam(_params.baseFile, "--base")
        && HasRequiredParam(_params.systemFile, "--system")
        && HasRequiredParam(_params.testResultsFile, "--test_results");
}

 

int main(int argc, char* argv[])

{

 

Params params;

if (!ReadCommandLine(argc, argv, params))

return -1;

 

try

{

ResultStatus status(RS_SUCCESS);

WordRecognizerWrapper wrapper;

double WER = 100;

 

if (params.learn)

{

cout << "Load learn base... ";

wrapper.LoadBase(params.baseFile.c_str());

cout << "OK" << endl;

 

cout << "Create means... ";

status = wrapper.CreateMeans(params.kmeansMaxIters, params.meansNum, params.minVectorsNumForKMeans);

if (status != RS_SUCCESS)

{

cout << endl << "WARNING: " << GetMessageFromStatus(status) << endl;

}

cout << "OK" << endl;

 

 

cout << "Adaptation... ";

wrapper.SetAdaptWeight(params.adaptWeight);

wrapper.MakeAdaptation();

cout << "OK" << endl;

 

cout << "LDA... ";

status = wrapper.CreateLDA(params.LDASize);

if (status != RS_SUCCESS)

{

cout << endl << "WARNING: " << GetMessageFromStatus(status) << endl;

}

cout << "OK" << endl;

 

cout << "Make projections... ";

wrapper.MakeProjections();

cout << "OK" << endl;

 

cout << "Create compare system... ";

status = wrapper.CreateCompareSystem(params.sigmaFloor);

cout << "OK" << endl;

 

cout << "Save system... ";

wrapper.SaveSystem(params.systemFile.c_str());

cout << "OK" << endl;

 

cout << "Testing learn base... ";

wrapper.Test(params.testResultsFile.c_str(), params.useSigma, WER);

cout << "OK" << endl;

}

else

{

cout << "Load system... ";

wrapper.LoadSystem(params.systemFile.c_str());

cout << "OK" << endl;

 

cout << "Load test base... ";

wrapper.LoadBase(params.baseFile.c_str());

cout << "OK" << endl;

 

cout << "Adaptation... ";

wrapper.MakeAdaptation();

cout << "OK" << endl;

 

cout << "Make projections... ";

wrapper.MakeProjections();

cout << "OK" << endl;

 

cout << "Testing test base... ";

wrapper.Test(params.testResultsFile.c_str(), params.useSigma, WER);

Информация о работе «Распознавание речи на STM32F4-Discovery»