Автор работы: Пользователь скрыл имя, 11 Октября 2013 в 17:10, курсовая работа
Также стоит отметить статью «Мел-кепстральные коэффициенты (MFCC) и распознавание речи» и выполненную на её основе работу по идентификации человека по голосу: «Кто там? — Идентификация человека по голосу».
В данной работе предлагается простой алгоритм (и его реализация на C++) системы распознавания речи по короткому словарю, основанный на анализе статистического распределения мел-кепстральных коэффициентов (Mel-frequency cepstrum coefficients, MFCC).
fileMFCC.read((char*)&freq, sizeof(freq));
if ((fileSize - 10) % (sizeof(float) * dim) != 0)
ThrowFormatException("can't read file %s", (path + m_files[i][j]).c_str());
if (m_mfcc.GetDim() == 0)
m_mfcc.SetDim((int)dim);
else if (m_mfcc.GetDim() != (int)dim)
ThrowFormatException("file %s has wrong dim: %d", (path + m_files[i][j]).c_str(), (int)dim);
int vectorsNum = (int)((fileSize - 10) / (sizeof(float) * dim));
if (vectorsNum == 0)
ThrowFormatException("file %s is empty");
m_mfcc.SetVectorsNum(
float* pVectors;
m_mfcc.GetVectors(filesCount, vectorsNum, pVectors);
fileMFCC.read((char*)pVectors, sizeof(float) * vectorsNum * m_mfcc.GetDim());
filesCount++;
if (fileMFCC.fail())
ThrowFormatException("can't read file %s", (path + m_files[i][j]).c_str());
fileMFCC.close();
}
}
m_baseOK = true;
}
//! Runs K-means over the loaded MFCC base and stores the resulting means in m_means.
//!
//! \param _maxItersNum    forwarded to WordRecognizer::KMeans
//! \param _meansNum       number of means to create; must be positive
//! \param _minVectorsNum  forwarded to WordRecognizer::KMeans
//! \return the status reported by WordRecognizer::KMeans
//! \throws exception when the base is not loaded or _meansNum <= 0
ResultStatus CreateMeans(const int &_maxItersNum, const int &_meansNum, const int &_minVectorsNum)
{
    if (!m_baseOK)
    {
        throw exception("load base first");
    }
    if (_meansNum <= 0)
    {
        throw exception("means num <= 0");
    }

    // Remember the configuration and size the output buffer: one m_dim-sized vector per mean.
    meansNum = _meansNum;
    m_dim = m_mfcc.GetDim();
    m_means.resize(meansNum * m_dim);

    ResultStatus status = WordRecognizer::KMeans(m_mfcc, _maxItersNum, meansNum, _minVectorsNum, &m_means[0]);

    // The means are flagged as ready whatever status K-means reported.
    m_meansOK = true;
    return status;
}
void SetAdaptWeight(const float &_weight)
{
m_adaptWeight = _weight;
}
//! Fills m_adaptMeans with one vector per loaded file, grouped the same way as m_files.
//!
//! Throws exception when the base is not loaded, when the means are missing, when
//! m_means does not hold meansNum * m_mfcc.GetDim() values, or when the total number
//! of files in m_files differs from m_mfcc.GetSize(). Sets m_adaptMeansOK at the end.
void MakeAdaptation()
{
if (!m_baseOK) throw exception("load base first");
if (!m_meansOK || (m_means.size() == 0)) throw exception("means are not created");
// Expected length of m_means: one m_mfcc.GetDim()-sized mean for each of the meansNum means.
int superVectorDim = meansNum * m_mfcc.GetDim();
if ((int)m_means.size() != superVectorDim) throw exception("wrong means size");
// Group the adapted means
// NOTE(review): the next two statements are cut off in this copy of the source; their arguments are not visible.
m_adaptMeans.SetDim(
m_adaptMeans.SetSize((int)m_
// Group i receives as many vectors as there are files in m_files[i]; count accumulates the total.
int count = 0;
for (int i = 0; i < m_adaptMeans.GetSize(); i++)
{
m_adaptMeans.SetVectorsNum(i, (int)m_files[i].size());
count += (int)m_files[i].size();
}
if (count != m_mfcc.GetSize()) throw exception("m_mfcc.GetSize() != sum of m_files[i].size()");
// Run the adaptation for every file
count = 0;
int adaptVectorsNum = 0;
float* adaptResult = NULL;
int mfccVectorsNum = 0;
float* mfccData = NULL;
for (int i = 0; i < m_adaptMeans.GetSize(); i++)
{
m_adaptMeans.GetVectors(i, adaptVectorsNum, adaptResult);
// count is the running index into m_mfcc across all groups; adaptResult steps
// forward by one m_adaptMeans.GetDim()-sized vector per iteration.
for (int j = 0; j < adaptVectorsNum; j++, adaptResult += m_adaptMeans.GetDim(), count++)
{
m_mfcc.GetVectors(count, mfccVectorsNum, mfccData);
// NOTE(review): the call below is cut off in this copy; presumably it adapts the means
// to mfccData and writes the result to adaptResult — confirm against the full source.
WordRecognizer::Adaptation(m_
}
}
m_adaptMeansOK = true;
}
//! Allocates the LDA matrix and mean buffers and invokes a WordRecognizer routine to fill them.
//!
//! \param _LDASize  stored in m_LDASize; m_LDAMatrix is resized to m_LDASize * m_adaptMeans.GetDim()
//! \return the status produced by the WordRecognizer call
//! \throws exception when adaptation has not been made or m_adaptMeans.GetDim() <= 0
ResultStatus CreateLDA(const int &_LDASize)
{
if (!m_adaptMeansOK) throw exception("make adaptation first");
m_LDASize = _LDASize;
int dim = m_adaptMeans.GetDim();
if (dim <= 0) throw exception("adapt means size <= 0");
m_LDAMatrix.resize(m_LDASize * dim);
m_mean.resize(dim);
// NOTE(review): the call below is cut off in this copy of the source; which WordRecognizer
// function is invoked and with what arguments is not visible here.
ResultStatus result = WordRecognizer::
// No check of result is visible before the flag is raised.
m_LDAOK = true;
return result;
}
//! Produces one projection vector in m_projections for every vector in m_adaptMeans.
//!
//! Throws exception when adaptation has not been made or the LDA has not been created.
//! Sets m_projectionsOK at the end.
void MakeProjections()
{
if (!m_adaptMeansOK) throw exception("make adaptation first");
if (!m_LDAOK) throw exception("create LDA first");
// NOTE(review): the next two statements are cut off in this copy of the source; their arguments are not visible.
m_projections.SetDim(m_
m_projections.SetSize(m_
int adaptDim = m_adaptMeans.GetDim();
for (int i = 0; i < m_adaptMeans.GetSize(); i++)
{
int vectorsNum = 0;
float* pAdaptVectors = NULL;
float* pProjectionVectors = NULL;
// Group i of m_projections is sized to hold as many vectors as group i of m_adaptMeans.
m_adaptMeans.GetVectors(i, vectorsNum, pAdaptVectors);
m_projections.SetVectorsNum(i, vectorsNum);
m_projections.GetVectors(i, vectorsNum, pProjectionVectors);
// Input advances by adaptDim floats per vector, output by m_LDASize floats.
for (int j = 0; j < vectorsNum; j++, pAdaptVectors += adaptDim, pProjectionVectors += m_LDASize)
// NOTE(review): the loop body is cut off in this copy; presumably it projects
// pAdaptVectors into pProjectionVectors — confirm against the full source.
WordRecognizer::
}
m_projectionsOK = true;
}
//! Prepares the per-word comparison data (m_learnWords, m_wordsMean, m_wordsSigma).
//!
//! \param _minSigma  not used in the visible part of the body (presumably passed to the
//!                   cut-off WordRecognizer call — confirm against the full source)
//! \return the status produced by the WordRecognizer call
//! \throws exception when the base is not loaded, projections are not made, or the number
//!         of projection groups differs from m_wordsBase.size()
ResultStatus CreateCompareSystem(const float &_minSigma)
{
if (!m_baseOK) throw exception("load base first");
if (!m_projectionsOK) throw exception("make projections first");
int dim = m_projections.GetDim();
int wordsNum = m_projections.GetSize();
if (wordsNum != (int)m_wordsBase.size())
throw exception("word num in base != projection clusters");
// The learned vocabulary is a copy of the loaded base's word list.
m_learnWords = m_wordsBase;
// dim floats of mean per word and a single sigma value per word.
m_wordsMean.resize(wordsNum * dim);
m_wordsSigma.resize(wordsNum);
// NOTE(review): the call below is cut off in this copy of the source; its name and arguments are not visible.
ResultStatus result = WordRecognizer::
// No check of result is visible before the flag is raised.
m_compareSystemOK = true;
return result;
}
//! Writes the trained system state to a binary file, section by section:
//! means, adaptation, LDA, compare system.
//!
//! \param _fileName  path of the file to create (truncated if it already exists)
//!
//! Reports failures through ThrowFormatException: when the file cannot be opened, and
//! when the stream is in a failed state after all writes.
//! NOTE(review): many write statements below are cut off in this copy of the source, so the
//! exact fields and byte counts written are not fully visible here.
void SaveSystem(const char* _fileName)
{
ofstream stateFile(_fileName, ios_base::out | ios_base::binary | ios_base::trunc);
if (!stateFile.is_open())
ThrowFormatException("can't open file %s", _fileName);
// means
// Each *OK flag is stored as a single byte, 0x01 or 0x00.
char boolChar = (m_meansOK) ? 0x01 : 0x00;
stateFile.write(&boolChar, sizeof(boolChar));
stateFile.write((char*)&m_dim, sizeof(m_dim));
stateFile.write((char*)&
// The means payload is written only when it is non-empty.
if (meansNum * m_dim > 0)
stateFile.write((char*)&m_
// adaptation
stateFile.write((char*)&m_
// LDA
boolChar = (m_LDAOK) ? 0x01 : 0x00;
stateFile.write(&boolChar, sizeof(boolChar));
stateFile.write((char*)&m_
if (m_LDASize * m_dim * meansNum > 0)
{
stateFile.write((char*)&m_
stateFile.write((char*)&m_
}
// compare system
boolChar = (m_compareSystemOK) ? 0x01 : 0x00;
stateFile.write(&boolChar, sizeof(boolChar));
int wordsNum = m_learnWords.size();
stateFile.write((char*)&
if (wordsNum > 0)
{
// Each word is stored as an int length followed by its characters.
for (int i = 0; i < wordsNum; i++)
{
int len = m_learnWords[i].length();
stateFile.write((char*)&len, sizeof(len));
stateFile.write(m_learnWords[i
}
stateFile.write((char*)&m_
stateFile.write((char*)&m_
}
// A single check at the end covers all of the writes above.
if (stateFile.fail())
ThrowFormatException("fail write file %s", _fileName);
}
//! Reads the trained system state from a binary file, section by section:
//! means, adaptation, LDA, compare system.
//!
//! \param _fileName  path of the file to read
//!
//! Reports failures through ThrowFormatException: when the file cannot be opened, and
//! when the stream is in a failed state after all reads.
//! NOTE(review): many read statements below are cut off in this copy of the source, so the
//! exact fields and byte counts read are not fully visible here.
//! NOTE(review): sizes read from the file (wordsNum, len) are used for resize() before the
//! stream state is checked and without range validation — confirm the file is trusted.
void LoadSystem(const char* _fileName)
{
ifstream stateFile(_fileName, ios_base::in | ios_base::binary);
if (!stateFile.is_open())
ThrowFormatException("can't open file %s", _fileName);
// means
// Each *OK flag is one byte; any non-zero value means true.
char boolChar;
stateFile.read(&boolChar, sizeof(boolChar));
m_meansOK = (boolChar != 0);
stateFile.read((char*)&m_dim, sizeof(m_dim));
stateFile.read((char*)&
m_means.resize(meansNum * m_dim);
if (meansNum * m_dim > 0)
stateFile.read((char*)&m_
// adaptation
stateFile.read((char*)&m_
// LDA
stateFile.read(&boolChar, sizeof(boolChar));
m_LDAOK = (boolChar != 0);
stateFile.read((char*)&m_
// Buffers are sized from the values read above: the LDA matrix holds
// m_LDASize * m_dim * meansNum elements, the mean vector m_dim * meansNum.
m_LDAMatrix.resize(m_LDASize * m_dim * meansNum);
m_mean.resize(m_dim * meansNum);
if (m_LDASize * m_dim * meansNum > 0)
{
stateFile.read((char*)&m_
stateFile.read((char*)&m_mean[
}
// compare system
stateFile.read(&boolChar, sizeof(boolChar));
m_compareSystemOK = (boolChar != 0);
int wordsNum = 0;
stateFile.read((char*)&
m_learnWords.resize(wordsNum);
m_wordsMean.resize(wordsNum * m_LDASize);
m_wordsSigma.resize(wordsNum);
if (wordsNum > 0)
{
// Scratch buffer reused for every word; it is given one extra byte for the terminator.
vector<char> tmpStr;
for (int i = 0; i < wordsNum; i++)
{
int len = 0;
stateFile.read((char*)&len, sizeof(len));
tmpStr.resize(len + 1);
stateFile.read(&tmpStr[0], len);
// Terminate the buffer so it can be assigned to the word as a C string.
tmpStr[len] = 0x00;
m_learnWords[i] = &tmpStr[0];
}
stateFile.read((char*)&m_
stateFile.read((char*)&m_
}
// A single check at the end covers all of the reads above.
if (stateFile.fail())
ThrowFormatException("fail read file %s", _fileName);
}
//! Classifies every projection vector of the loaded base, writes a per-word table of
//! answers to a text file and reports the word error rate.
//!
//! \param _resultsFileName  path of the text report to create
//! \param _useSigma         not used in the visible part of the body (presumably passed to the
//!                          cut-off WordRecognizer call — confirm against the full source)
//! \param _wer              output: percentage of attempts that were not counted as correct
//!
//! \throws exception when the base is not loaded, projections are not made, or the compare
//!         system is not created. A report file that cannot be opened is reported through
//!         ThrowFormatException.
void Test(const char* _resultsFileName, bool _useSigma, double &_wer)
{
if (!m_baseOK) throw exception("load base first");
if (!m_projectionsOK) throw exception("make projections first");
if (!m_compareSystemOK) throw exception("create compare system first");
ofstream resultsFile(_resultsFileName);
if (!resultsFile.is_open())
ThrowFormatException("can't open file %s", _resultsFileName);
int learnedWordsNum = (int)m_learnWords.size();
// Header row: one tab-separated column per learned word.
resultsFile << "words: ";
for (int i = 0; i < learnedWordsNum; i++)
resultsFile << "\t" << m_learnWords[i];
resultsFile<<endl;
int correctNum = 0;
int attemptsNum = 0;
// answers[k] counts how often learned word k was chosen for the current test word.
vector<int> answers(learnedWordsNum);
vector<float> probability(learnedWordsNum);
int dim = m_projections.GetDim();
for (int i = 0; i < m_projections.GetSize(); i++)
{
string testWordName = m_wordsBase[i];
float* pTestVector = NULL;
int vectorsNum = 0;
m_projections.GetVectors(i, vectorsNum, pTestVector);
// Reset the per-word counters.
for (int k = 0; k < learnedWordsNum; k++)
answers[k] = 0;
for (int j = 0; j < vectorsNum; j++, pTestVector += dim)
{
attemptsNum++;
// NOTE(review): the call below is cut off in this copy; presumably it fills
// probability[] for pTestVector — confirm against the full source.
WordRecognizer::
// Pick the learned word with the largest probability. Because maxP starts at 0,
// index 0 is chosen when no entry is strictly positive.
float maxP = 0.;
int maxPindx = 0;
for (int k = 0; k < learnedWordsNum; k++)
{
if (probability[k] > maxP)
{
maxP = probability[k];
maxPindx = k;
}
}
answers[maxPindx]++;
// NOTE(review): the condition below is cut off in this copy; presumably it compares the
// chosen word with testWordName — confirm against the full source.
if (m_learnWords[maxPindx].
correctNum++;
}
// One report row per test word: its name followed by the answer counts.
resultsFile << testWordName << ": ";
for (int k = 0; k < learnedWordsNum; k++)
resultsFile << "\t" << answers[k];
resultsFile << endl;
}
resultsFile << endl;
resultsFile << "attempts num: " << attemptsNum << endl;
resultsFile << "correct num: " << correctNum << endl;
// NOTE(review): when attemptsNum is 0 this evaluates 0.0 / 0 in floating point (no guard is visible).
_wer = 100. * (double)(attemptsNum - correctNum) / attemptsNum;
resultsFile << "word error rate: " << _wer << " %" << endl;
}
};
//! Command-line configuration of the program together with its default values.
struct Params
{
    string baseFile;            //!< File describing the base (for testing or for learning)
    string systemFile;          //!< File that stores the result of learning the system
    string testResultsFile;     //!< File with the testing results
    bool learn;                 //!< Learn the system?
    int meansNum;               //!< Number of means
    int kmeansMaxIters;         //!< Maximum number of K-means iterations
    int minVectorsNumForKMeans; //!< Smallest number of vectors corresponding to one mean
    float adaptWeight;          //!< "Sensitivity" of the adaptation
    int LDASize;                //!< Size of the LDA matrix
    float sigmaFloor;           //!< Minimum value of the standard deviation
    bool useSigma;              //!< Use within-class standard deviation normalization when testing

    //! Sets every option to its default; the three file names start out empty.
    Params()
        : learn(false)
        , meansNum(10)
        , kmeansMaxIters(2000)
        , minVectorsNumForKMeans(10)
        , adaptWeight(20.f)
        , LDASize(20)
        , sigmaFloor(0.001f)
        , useSigma(false)
    {
    }
};
//! Prints the list of command-line options to cout.
//! \param _params  supplies the default values shown next to each option
void PrintHelp(Params &_params)
{
    // Layout of the base description file, one output line per entry.
    static const char* const baseFormatLines[] =
    {
        "--base File with base description (required). File format:",
        " [WORDS NUM]",
        " [WORD 1 NAME]",
        " [FILES NUM OF WORD 1]",
        " [FILE 1 OF WORD 1]",
        " [FILE 2 OF WORD 1]",
        " ............",
        " [WORD 2 NAME]",
        " [FILES NUM OF WORD 2]",
        " [FILE 1 OF WORD 2]",
        " [FILE 2 OF WORD 2]",
        " ............"
    };
    const int baseFormatLinesNum = (int)(sizeof(baseFormatLines) / sizeof(baseFormatLines[0]));

    // Switch options are shown as enabled/disabled rather than as numbers.
    const char* const learnDefault = _params.learn ? "enabled" : "disabled";
    const char* const useSigmaDefault = _params.useSigma ? "enabled" : "disabled";

    cout << "--help Show full list of parameters." << endl << endl;

    for (int line = 0; line < baseFormatLinesNum; line++)
    {
        cout << baseFormatLines[line] << endl;
    }
    cout << endl;

    cout << "--system File containing leaned system results (required)." << endl << endl;
    cout << "--test_results File with testing results (required)." << endl << endl;
    cout << "--learn Learn system (" << learnDefault << " by default)." << endl << endl;
    cout << "--means Means number (" << _params.meansNum << " by default)." << endl << endl;
    cout << "--max_iters Maximum iterations number for K-means algorithm (" << _params.kmeansMaxIters << " by default)." << endl << endl;
    cout << "--min_vectors Minimum vectors number corresponds to one mean (" << _params.minVectorsNumForKMeans << " by default)." << endl << endl;
    cout << "--adapt_weight Adaptation weight (" << _params.adaptWeight << " by default)." << endl << endl;
    cout << "--lda LDA-matrix size (" << _params.LDASize << " by default)." << endl << endl;
    cout << "--use_sigma Using within-class deviation normalization in testing (" << useSigmaDefault << " by default)." << endl << endl;
    cout << "--sigma_floor Standart deviation minimum value (" << _params.sigmaFloor << " by default)." << endl << endl;
}
//! Returns the value that follows the option at argv[arg] and moves arg onto it.
//! When the option is the last argument, prints the "Can't read value" message for
//! _optionName and returns NULL, leaving arg unchanged.
static const char* NextOptionValue(int argc, char* argv[], int &arg, const char* _optionName)
{
    if (argc <= arg + 1)
    {
        cout << "Can't read value for parameter '" << _optionName << "'" << endl;
        return NULL;
    }
    arg++;
    return argv[arg];
}

//! Parses the command line into _params.
//!
//! \param argc     number of entries in argv
//! \param argv     program arguments; argv[0] is skipped
//! \param _params  receives the parsed values; options that are absent keep their current value
//! \return true when parsing succeeded and all required options (--base, --system,
//!         --test_results) are present; false after --help, on an unknown option, on an
//!         option with a missing value, or when a required option is absent. A message
//!         explaining the reason is printed to cout in every false case.
bool ReadCommandLine(int argc, char* argv[],
Params &_params)
{
    for (int arg = 1; arg < argc; arg++)
    {
        const char* const option = argv[arg];

        if (strcmp(option, "--help") == 0)
        {
            PrintHelp(_params);
            return false;
        }
        else if (strcmp(option, "--learn") == 0)
        {
            _params.learn = true;
        }
        else if (strcmp(option, "--use_sigma") == 0)
        {
            _params.useSigma = true;
        }
        else if (strcmp(option, "--base") == 0)
        {
            const char* value = NextOptionValue(argc, argv, arg, option);
            if (value == NULL) return false;
            _params.baseFile = value;
        }
        else if (strcmp(option, "--system") == 0)
        {
            const char* value = NextOptionValue(argc, argv, arg, option);
            if (value == NULL) return false;
            _params.systemFile = value;
        }
        else if (strcmp(option, "--test_results") == 0)
        {
            const char* value = NextOptionValue(argc, argv, arg, option);
            if (value == NULL) return false;
            _params.testResultsFile = value;
        }
        else if (strcmp(option, "--means") == 0)
        {
            const char* value = NextOptionValue(argc, argv, arg, option);
            if (value == NULL) return false;
            _params.meansNum = atoi(value);
        }
        else if (strcmp(option, "--max_iters") == 0)
        {
            const char* value = NextOptionValue(argc, argv, arg, option);
            if (value == NULL) return false;
            _params.kmeansMaxIters = atoi(value);
        }
        else if (strcmp(option, "--min_vectors") == 0)
        {
            const char* value = NextOptionValue(argc, argv, arg, option);
            if (value == NULL) return false;
            _params.minVectorsNumForKMeans = atoi(value);
        }
        else if (strcmp(option, "--adapt_weight") == 0)
        {
            const char* value = NextOptionValue(argc, argv, arg, option);
            if (value == NULL) return false;
            _params.adaptWeight = (float)atof(value);
        }
        else if (strcmp(option, "--lda") == 0)
        {
            const char* value = NextOptionValue(argc, argv, arg, option);
            if (value == NULL) return false;
            _params.LDASize = atoi(value);
        }
        else if (strcmp(option, "--sigma_floor") == 0)
        {
            // The message now names '--sigma_floor'; it used to name a non-existent '--floor'.
            const char* value = NextOptionValue(argc, argv, arg, option);
            if (value == NULL) return false;
            _params.sigmaFloor = (float)atof(value);
        }
        else
        {
            cout << "Can't parse parameter " << option << endl;
            return false;
        }
    }

    // All three file options are documented as required in PrintHelp.
    if (_params.baseFile.length() == 0)
    {
        cout << "Parameter '--base' not founded. Use '--help' for the full list of parameters." << endl;
        return false;
    }
    if (_params.systemFile.length() == 0)
    {
        cout << "Parameter '--system' not founded. Use '--help' for the full list of parameters." << endl;
        return false;
    }
    // This check used to repeat the '--base' test, so a missing '--test_results' went unnoticed.
    if (_params.testResultsFile.length() == 0)
    {
        cout << "Parameter '--test_results' not founded. Use '--help' for the full list of parameters." << endl;
        return false;
    }
    return true;
}
int main(int argc, char* argv[])
{
Params params;
if (!ReadCommandLine(argc, argv, params))
return -1;
try
{
ResultStatus status(RS_SUCCESS);
WordRecognizerWrapper wrapper;
double WER = 100;
if (params.learn)
{
cout << "Load learn base... ";
wrapper.LoadBase(params.
cout << "OK" << endl;
cout << "Create means... ";
status = wrapper.CreateMeans(params.
if (status != RS_SUCCESS)
{
cout << endl << "WARNING: " << GetMessageFromStatus(status) << endl;
}
cout << "OK" << endl;
cout << "Adaptation... ";
wrapper.SetAdaptWeight(params.
wrapper.MakeAdaptation();
cout << "OK" << endl;
cout << "LDA... ";
status = wrapper.CreateLDA(params.
if (status != RS_SUCCESS)
{
cout << endl << "WARNING: " << GetMessageFromStatus(status) << endl;
}
cout << "OK" << endl;
cout << "Make projections... ";
wrapper.MakeProjections();
cout << "OK" << endl;
cout << "Create compare system... ";
status = wrapper.CreateCompareSystem(
cout << "OK" << endl;
cout << "Save system... ";
wrapper.SaveSystem(params.
cout << "OK" << endl;
cout << "Testing learn base... ";
wrapper.Test(params.
cout << "OK" << endl;
}
else
{
cout << "Load system... ";
wrapper.LoadSystem(params.
cout << "OK" << endl;
cout << "Load test base... ";
wrapper.LoadBase(params.
cout << "OK" << endl;
cout << "Adaptation... ";
wrapper.MakeAdaptation();
cout << "OK" << endl;
cout << "Make projections... ";
wrapper.MakeProjections();
cout << "OK" << endl;
cout << "Testing test base... ";
wrapper.Test(params.