-

   rss_rss_hh_new

 - e-mail

 

 -

 LiveInternet.ru:
: 17.03.2011
:
:
: 51

:


vs :

, 26 2017 . 11:16 +
, image processing , . , , .

, . , , , , , . , .

/ justin lincoln / CC-BY


,

, . , : , , , ( pitch, ). 65-260 , 100-525 . , .

, . , . , , , , , , .

, . - , .

, - , , . 25-50 , , . , , , , .

- (MFCC). , , . , , . -, , , (). -, , . , 12 - , ( , ).

(pitch, , MFCC) .


/ Daniel Oines / CC-BY


. , , ImageNet IMDB . TIMIT, ( ), VCTK 7 . , : 109 . 4 1-5 .



. 8 96 , 8 : . , Wavenet , , . , , . openSMILE , , .

Python, Random Forest, , sklearn. , , .

, , - . , , , .

, . , - . , , - . .

, , . , , , , . , .. speaker free, . , : , .

.

data.csv, , .

, :

import csv, os
import numpy as np
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import GroupKFold

# Read data.csv: the first row is a header, every following row is one record.
with open('data.csv', 'r') as c:
    r = csv.reader(c, delimiter=',')
    # next(r) works on both Python 2 and 3; reader.next() exists only on Python 2.
    header = next(r)
    data = list(r)
# All cells are strings at this point; columns are converted individually below.
data = np.array(data)

# Preprocess: split the table into typed columns.
genders = data[:, 0].astype(int)       # column 0, integer label
speakers = data[:, 1].astype(int)      # column 1, integer speaker id
filenames = data[:, 2]                 # column 2, kept as strings
times = data[:, 3].astype(float)       # column 3, float
# 4:5 (rather than 4) keeps pitch two-dimensional, shape (rows, 1).
pitch = data[:, 4:5].astype(float)
# Column 4 onward, so the pitch column is included in features as well.
# NOTE(review): the meaning of columns 5+ is not visible here -- confirm against data.csv.
features = data[:, 4:].astype(float)

- . sklearn GroupKFold : - , . , . .

def subject_cross_validation(clf, x, y, subj, folds):
    """Return the mean score of ``clf`` over a group-wise K-fold split.

    The split is produced by ``GroupKFold(n_splits=folds)`` with ``subj``
    passed as the ``groups`` argument. For every fold, ``clf`` is fitted on
    the training rows and then scored on the test rows with ``clf.score``.

    Args:
        clf: estimator exposing ``fit(x, y)`` and ``score(x, y)``.
        x: sample matrix, indexed with the index arrays the splitter yields.
        y: labels, indexed the same way as ``x``.
        subj: group label per sample, forwarded to the splitter.
        folds: number of splits.

    Returns:
        ``np.mean`` of the per-fold scores.
    """
    def _fit_and_score(train_rows, test_rows):
        # Fit must happen before scoring; the same estimator object is reused per fold.
        clf.fit(x[train_rows], y[train_rows])
        return clf.score(x[test_rows], y[test_rows])

    splitter = GroupKFold(n_splits=folds)
    fold_scores = [
        _fit_and_score(train_rows, test_rows)
        for train_rows, test_rows in splitter.split(x, y, groups=subj)
    ]
    return np.mean(fold_scores)

, . . , , . ( + + mfcc):

# Classify frames separately: every row is its own sample. `speakers` is
# passed as the grouping argument and 5 folds are used.
# print(...) with a single pre-formatted string emits the same text on
# Python 2 and Python 3; the bare print statement is a SyntaxError on Python 3.
score_frames_pitch = subject_cross_validation(RFC(n_estimators=100), pitch, genders, speakers, 5)
print('Frames classification on pitch, accuracy: %s' % score_frames_pitch)
score_frames_features = subject_cross_validation(RFC(n_estimators=100), features, genders, speakers, 5)
print('Frames classification on all features, accuracy: %s' % score_frames_features)

66 73% . , , , 50%. 64% . : (, ), . , : , .

, . , :

def make_sample(x, y, subj, names, statistics=(np.mean, np.std, np.median, np.min, np.max)):
    """Collapse rows that share a value in ``names`` into one summary row each.

    For every unique value in ``names`` (in the sorted order returned by
    ``np.unique``), each function in ``statistics`` is applied along axis 0
    to the matching rows of ``x`` and the results are concatenated into one
    feature vector. The label and subject of the first matching row are
    taken as the label and subject of the group.

    Args:
        x: array-like of samples, one row per entry in ``names``. May be
            1-D, in which case every statistic contributes one value.
        y: array-like of labels, one per row of ``x``.
        subj: array-like of subject ids, one per row of ``x``.
        names: array-like of group keys, one per row of ``x``.
        statistics: iterable of callables accepting ``(array, axis=0)``.
            The default is a tuple so that no mutable object is shared
            between calls.

    Returns:
        A tuple ``(avx, avy, avs)`` of numpy arrays: the summary rows, the
        group labels cast to int, and the group subject ids cast to int.
    """
    # Coerce inputs so boolean-mask indexing also works for plain lists;
    # with a list, `names == k` would be a single bool instead of a mask.
    x = np.asarray(x)
    y = np.asarray(y)
    subj = np.asarray(subj)
    names = np.asarray(names)

    avx = []
    avy = []
    avs = []
    for k in np.unique(names):
        idx = names == k
        rows = x[idx]
        v = []
        for stat in statistics:
            # atleast_1d: a statistic over 1-D input is a scalar, not a vector.
            v.extend(np.atleast_1d(stat(rows, axis=0)).tolist())
        avx.append(v)
        avy.append(y[idx][0])
        avs.append(subj[idx][0])
    return np.array(avx), np.array(avy).astype(int), np.array(avs).astype(int)

# Summarise the per-frame rows into one row per distinct filename, once for
# the full feature set and once for pitch alone. Both calls group by the same
# filenames, so the label and speaker arrays returned by the second call
# simply rebind the names set by the first.
average_features, average_genders, average_speakers = make_sample(
    x=features, y=genders, subj=speakers, names=filenames)
average_pitch, average_genders, average_speakers = make_sample(
    x=pitch, y=genders, subj=speakers, names=filenames)

. , , :

# Train and evaluate on the per-file summary rows: first pitch statistics
# only, then statistics of all features. `average_speakers` is passed as the
# grouping argument and 5 folds are used.
# print(...) with a single pre-formatted string emits the same text on
# Python 2 and Python 3; the bare print statement is a SyntaxError on Python 3.
score_pitch = subject_cross_validation(RFC(n_estimators=100), average_pitch, average_genders, average_speakers, 5)
print('Utterance classification on pitch, accuracy: %s' % score_pitch)
score_features = subject_cross_validation(RFC(n_estimators=100), average_features, average_genders, average_speakers, 5)
print('Utterance classification on features, accuracy: %s' % score_features)

97,2% , . , :

# Keep only the rows whose pitch value is greater than 1, then rebuild the
# per-file summary rows from the remaining frames and evaluate again.
# NOTE(review): presumably frames with no detected pitch carry a value <= 1
# (e.g. 0) -- confirm against how data.csv was produced.
filter_idx = pitch[:, 0] > 1
filtered_average_features, filtered_average_genders, filtered_average_speakers = make_sample(
    features[filter_idx], genders[filter_idx], speakers[filter_idx], filenames[filter_idx])
score_filtered = subject_cross_validation(
    RFC(n_estimators=100), filtered_average_features, filtered_average_genders, filtered_average_speakers, 5)
# Single pre-formatted string: identical output on Python 2 and Python 3.
print('Utterance classification an averaged features over filtered frames, accuracy: %s' % score_filtered)

, 98.4% . ( ), , , .


. , 1-2% , -, , , . , ,

, , , .

:


.



Original source: habrahabr.ru (comments, light).

https://habrahabr.ru/post/334136/

:  

: [1] []
 

:
: 

: ( )

:

  URL