Posted by BosonBeard.

Good day, dear Habr readers!

    I am only taking my first steps in Data Science, so please do not judge this post too harshly: comments, corrections and constructive criticism are very welcome.

    This article is a small end-to-end example of applying machine learning to a practical problem from my own field, the lighting industry: teaching a machine to recognize the type of a luminaire from its light distribution curve.

    The code is written in Python (my main working language is C#, so the Python here may not be perfectly idiomatic). The dataset and the notebook are published on GitHub, so everything below can be reproduced.

    I hope you find it useful.



    So, let's begin.

    Part I: Introduction.


    There is a lot of noise around Data Science these days, but most articles and courses either stay purely theoretical or walk through the same well-worn demo datasets. What I was missing was a simple worked example on "live" data from an ordinary applied domain, so I decided to make one myself, using the data I deal with at work (I am a lighting engineer rather than a programmer).


    My thanks to GadPetrovich for his help in preparing this article.

    Contents:
    Part I: Introduction.
    Part II: Tools and methods.
    Part III: The data.
    Part IV: Data preparation.
    Part V: Loading the data and a first look.
    Part VI: Finally, classification!
    Part VII: Conclusions.

    Part II: Tools and methods.


    The whole analysis is done in Python with the standard stack: pandas and NumPy for the data, matplotlib for the plots, and scikit-learn for the machine learning itself.

    Initially I also considered the unsupervised route: dimensionality reduction and clustering with PCA, t-SNE and DBSCAN. In the end only t-SNE survived, and only for visualization.

    For the classification itself I settled on SVC, the support vector classifier from scikit-learn, with a random forest as a point of comparison.

    Part III: The data.


    What is a light distribution curve?
    In short:

    A light distribution curve describes how the luminous intensity of a luminaire is distributed in space.
    The luminous intensity (in candela) is measured at polar angles from 0 to 180 degrees, usually in one or several vertical planes; for an axially symmetric luminaire a single plane is enough. Plotted in polar coordinates, these values give the familiar "butterfly" diagram from luminaire datasheets.

    The shape of this curve largely determines what a luminaire is good for: a narrow beam for accent lighting, a wide one for general lighting, and so on.

    Where do we get the curves? Photometric data is distributed in standard text formats, most commonly IESNA LM-63 (.ies files). An .ies file is a plain text file with a short header and a table of luminous intensity values over a grid of angles.

    So why not teach the machine to determine the type of a luminaire from its curve automatically?

    Below, under the spoiler, is an example of what an .ies file looks like inside.



    Example of an .ies file:

    IESNA:LM-63-1995
    [TEST] SL20695
    [MANUFAC] PHILIPS
    [LUMCAT]
    [LUMINAIRE] NA
    [LAMP] 3 Step Switch A60 220-240V9.5W-60W 806lm 150D 3000K-6500K Non Dim
    [BALLAST] NA
    [OTHER] B-Angle = 0.00 B-Tilt = 0.00 2015-12-07
    TILT=NONE
    1 806.00 1 181 1 1 2 -0.060 -0.060 0.120
    1.0 1.0 9.50
    0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00
    10.00 11.00 12.00 13.00 14.00 15.00 16.00 17.00 18.00 19.00
    20.00 21.00 22.00 23.00 24.00 25.00 26.00 27.00 28.00 29.00
    30.00 31.00 32.00 33.00 34.00 35.00 36.00 37.00 38.00 39.00
    40.00 41.00 42.00 43.00 44.00 45.00 46.00 47.00 48.00 49.00
    50.00 51.00 52.00 53.00 54.00 55.00 56.00 57.00 58.00 59.00
    60.00 61.00 62.00 63.00 64.00 65.00 66.00 67.00 68.00 69.00
    70.00 71.00 72.00 73.00 74.00 75.00 76.00 77.00 78.00 79.00
    80.00 81.00 82.00 83.00 84.00 85.00 86.00 87.00 88.00 89.00
    90.00 91.00 92.00 93.00 94.00 95.00 96.00 97.00 98.00 99.00
    100.00 101.00 102.00 103.00 104.00 105.00 106.00 107.00 108.00 109.00
    110.00 111.00 112.00 113.00 114.00 115.00 116.00 117.00 118.00 119.00
    120.00 121.00 122.00 123.00 124.00 125.00 126.00 127.00 128.00 129.00
    130.00 131.00 132.00 133.00 134.00 135.00 136.00 137.00 138.00 139.00
    140.00 141.00 142.00 143.00 144.00 145.00 146.00 147.00 148.00 149.00
    150.00 151.00 152.00 153.00 154.00 155.00 156.00 157.00 158.00 159.00
    160.00 161.00 162.00 163.00 164.00 165.00 166.00 167.00 168.00 169.00
    170.00 171.00 172.00 173.00 174.00 175.00 176.00 177.00 178.00 179.00
    180.00
    0.00
    137.49 137.43 137.41 137.32 137.23 137.10 136.97
    136.77 136.54 136.27 136.01 135.70 135.37 135.01
    134.64 134.27 133.85 133.37 132.93 132.42 131.93
    131.41 130.87 130.27 129.68 129.08 128.44 127.78
    127.11 126.40 125.69 124.92 124.18 123.43 122.63
    121.78 120.89 120.03 119.20 118.26 117.34 116.40
    115.46 114.49 113.53 112.56 111.52 110.46 109.42
    108.40 107.29 106.23 105.13 104.03 102.91 101.78
    100.64 99.49 98.35 97.15 95.98 94.80 93.65
    92.43 91.23 89.99 88.79 87.61 86.42 85.17
    83.96 82.76 81.49 80.31 79.13 77.91 76.66
    75.46 74.29 73.07 71.87 70.67 69.49 68.33
    67.22 66.00 64.89 63.76 62.61 61.46 60.36
    59.33 58.19 57.11 56.04 54.98 53.92 52.90
    51.84 50.83 49.82 48.81 47.89 46.88 45.92
    44.99 44.03 43.11 42.18 41.28 40.39 39.51
    38.62 37.74 36.93 36.09 35.25 34.39 33.58
    32.79 32.03 31.25 30.46 29.70 28.95 28.23
    27.48 26.79 26.11 25.36 24.71 24.06 23.40
    22.73 22.08 21.43 20.84 20.26 19.65 19.04
    18.45 17.90 17.34 16.83 16.32 15.78 15.28
    14.73 14.25 13.71 13.12 12.56 11.94 11.72
    11.19 10.51 9.77 8.66 6.96 4.13 0.91
    0.20 0.17 0.16 0.19 0.19 0.20 0.16
    0.20 0.18 0.20 0.20 0.21 0.20 0.16
    0.20 0.17 0.19 0.20 0.19 0.18


    Of all this content we only care about the table of luminous intensity values measured across the polar angles; the header and the rest can be discarded.

    In total I collected 193 such files. Not a large sample, but enough for a demonstration.

    I will not publish the original .ies files themselves, but the values extracted from them, already assembled into a dataset, are in the GitHub repository, so all of the following can be reproduced.
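    By the way, pulling the numbers out of an LM-63 file programmatically is not hard. Here is a simplified sketch of my own (it assumes a single vertical plane, as in the example above, and ignores the corner cases a production parser would have to handle):

    def read_ies_intensities(path):
        """Return (vertical_angles, candela_values) from a simple .ies file."""
        with open(path) as f:
            lines = f.readlines()
        # the numeric block starts right after the TILT=... line
        start = next(i for i, l in enumerate(lines) if l.strip().startswith('TILT')) + 1
        numbers = ' '.join(lines[start:]).split()
        n_v = int(numbers[3])    # number of vertical angles (181 above)
        n_h = int(numbers[4])    # number of horizontal angles (1 above)
        data = [float(x) for x in numbers[13:]]  # skip the 13 header numbers
        vertical = data[:n_v]
        # skip the n_h horizontal angles, then read n_v values per plane
        candela = data[n_v + n_h : n_v + n_h + n_v * n_h]
        return vertical, candela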

    Part IV: Data preparation.


    Different manufacturers measure at different angular steps, so first of all everything has to be brought to a common grid: I used polar angles from 0 to 180 degrees in 10-degree steps, i.e. 19 values per curve.
    Curves measured at a finer step were simply resampled onto this grid.

    I did this preprocessing by hand, in MS Excel.
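    For those who prefer to stay in Python, the same resampling can be done with linear interpolation; a minimal sketch (raw_angles and raw_values are hypothetical placeholders for the arrays read from one file):

    import numpy as np

    raw_angles = np.arange(0, 181, 1.0)  # e.g. a 1-degree step, as in the file above
    raw_values = np.random.rand(181)     # stand-in for the measured intensities

    grid = np.arange(0, 190, 10.0)       # the common grid: 0..180 in 10-degree steps
    resampled = np.interp(grid, raw_angles, raw_values)
    print(resampled.shape)               # (19,)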

    As a result I got two tables, train and test, in which each row is a file name, 19 luminous intensity values and one of 4 class labels.

    The classes are the standard types of light distribution used in Russian lighting practice:
    concentrated (K), deep (G), cosine (D) and semi-wide (L).
    They differ in the zone of polar angles into which the maximum of the luminous intensity falls:
    K = 0-15; G = 0-30; D = 0-35; L = 35-55 (degrees);

    and the numeric labels are K = 1; G = 2; D = 3; L = 4 (I will refer to the classes by these numbers from here on).

    An important caveat: I assigned the labels by eye, looking at the plotted curves, so I cannot guarantee that the labeling is 100% accurate.

    If you notice mistakes in it, or have suggestions, please let me know.

    Part V: Loading the data and a first look.


    From here on it is all code with commentary. The full notebook (.ipynb for Jupyter) is in the same GitHub repository.

    First, import the libraries:

    #import libraries
    import warnings
    warnings.filterwarnings('ignore')
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches
    from sklearn.model_selection import train_test_split, StratifiedKFold
    from sklearn.svm import SVC
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, f1_score
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.manifold import TSNE
    from scipy.stats import uniform as sp_rand
    from IPython.display import display
    %matplotlib inline

    #reading data
    train_df=pd.read_csv('lidc_data\\train.csv',sep='\t',index_col=None)
    test_df=pd.read_csv('lidc_data\\test.csv',sep='\t',index_col=None)
    print('train shape {0}, test shape {1}'.format(train_df.shape, test_df.shape))
    display('train:',train_df.head(4),'test:',test_df.head(4))

    #divide the data and labels
    X_train=np.array(train_df.iloc[:,1:-1])
    X_test=np.array(test_df.iloc[:,1:-1])
    y_train=np.array(train_df['label'])
    y_test=np.array(test_df['label'])


    Nothing unusual so far: we read the tab-separated tables with pandas and split them into feature matrices (the 19 intensity values per curve) and label vectors (classes 1 to 4). Here is what the class distribution looks like:



    As you can see, the classes are not balanced: class 4 (semi-wide) is represented noticeably worse than the others, in both the training and the test set, making up roughly a quarter of the size of the largest class.
    The distributions were plotted like this:
    #draw class distributions

    test_n_max=test_df.label.value_counts().max()
    train_n_max=train_df.label.value_counts().max()

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,5))
    train_df.label.plot.hist(ax=axes[0],title='train data class distribution', bins=7,yticks=np.arange(0,train_n_max+2,2), xticks=np.unique(train_df.label.values))
    test_df.label.plot.hist(ax=axes[1],title='test data class distribution', bins=7,yticks=np.arange(0,test_n_max+2,2), xticks=np.unique(test_df.label.values))
    


    The result: [figure: class distribution histograms for train and test]



    For the type of a light distribution curve it is the shape that matters, not the absolute luminous intensity: a strong and a weak luminaire can have exactly the same class of curve. So before classifying, the data should be normalized.

    Fortunately, scikit-learn has ready-made tools for this, for example MinMaxScaler. But do not apply it blindly!

    MinMaxScaler rescales the data to the range from 0 to 1, and it does so feature by feature, i.e. column by column.

    We, however, need to normalize each individual curve, i.e. each row. Hence a small trick: transpose the matrix, scale it, and transpose it back.
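    To see the difference on a toy example (my own illustration, not from the original walkthrough): MinMaxScaler normalizes column by column, so scaling the transposed matrix and transposing back normalizes each row instead:

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    A = np.array([[0., 5., 10.],
                  [0., 1., 2.]])
    print(MinMaxScaler().fit_transform(A))     # per column: [[0,1,1],[0,0,0]]
    print(MinMaxScaler().fit_transform(A.T).T) # per row:    [[0,0.5,1],[0,0.5,1]]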

    With that in mind, the scaling looks like this:

    #scaled all data for final prediction
    scl=MinMaxScaler()
    X_train_scl=scl.fit_transform(X_train.T).T
    X_test_scl=scl.fit_transform(X_test.T).T
    
    #scaled part of data for test
    X_train_part, X_test_part, y_train_part, y_test_part = train_test_split(X_train, y_train, test_size=0.20, stratify=y_train, random_state=42)
    
    scl=MinMaxScaler()
    X_train_part_scl=scl.fit_transform(X_train_part.T).T
    X_test_part_scl=scl.fit_transform(X_test_part.T).T
    


    Note, by the way, how uniform the scikit-learn interface is: every preprocessor and model has fit for training, transformers have transform (and fit_transform, which combines the two calls into one), and models have predict for getting predictions.
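    That uniform interface is what makes scikit-learn pipelines possible. Purely as an illustration (my addition, not how it is done in this post), the row-wise scaling and the classifier could be glued into a single estimator; minmax_scale with axis=1 plays the role of the transpose trick:

    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import FunctionTransformer, minmax_scale

    # row-wise scaling step: normalize every curve (row) to [0, 1]
    row_scaler = FunctionTransformer(lambda X: minmax_scale(X, axis=1))
    pipe = make_pipeline(row_scaler, SVC(kernel='rbf', gamma=2, C=1.1, random_state=42))
    pipe.fit(X_train_part, y_train_part)
    print(pipe.score(X_test_part, y_test_part))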

    Why did I split off a fifth of the training set (X_train_part / X_test_part)? Because the test set should be touched exactly once, for the final evaluation, the way it is done on Kaggle; all intermediate experiments run on the held-out part of the training data.
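    Instead of a single hold-out split one could also cross-validate; StratifiedKFold is already imported above, so a sketch of such a check (my addition, not from the original post) might look like:

    from sklearn.model_selection import cross_val_score

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(SVC(kernel='rbf', gamma=2, C=1.1, random_state=42),
                             X_train_scl, y_train, cv=skf, scoring='f1_macro')
    print('f1_macro: {:.3f} +/- {:.3f}'.format(scores.mean(), scores.std()))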

    Now let's look at the curves themselves, before and after scaling. We take one sample from each of the four classes and plot the luminous intensity against the polar angle.

    
    #not scaled
    x=np.arange(0,190,10)
    plt.figure(figsize=(17,10))
    plt.plot(x,X_train[13],color='blue')
    plt.plot(x,X_train[109],color='green')
    plt.plot(x,X_train[68],color='orange')
    plt.plot(x,X_train[127],color='red')

    c1 = mpatches.Patch(color='blue', label='class 1')
    c2 = mpatches.Patch(color='green', label='class 2')
    c3 = mpatches.Patch(color='orange', label='class 3')
    c4 = mpatches.Patch(color='red', label='class 4')

    plt.legend(handles=[c1,c2,c3,c4])

    plt.xlabel('polar angle')
    plt.ylabel('luminous intensity')
    plt.title('Light distribution curves (not scaled)')
    


    And the same for the scaled data (yes, it would have been neater to combine these plots into one figure with subplots):
    
    #scaled
    x=np.arange(0,190,10)
    plt.figure(figsize=(17,10))
    plt.plot(x,X_train_scl[13],color='blue')
    plt.plot(x,X_train_scl[109],color='green')
    plt.plot(x,X_train_scl[68],color='orange')
    plt.plot(x,X_train_scl[127],color='red')

    c1 = mpatches.Patch(color='blue', label='class 1')
    c2 = mpatches.Patch(color='green', label='class 2')
    c3 = mpatches.Patch(color='orange', label='class 3')
    c4 = mpatches.Patch(color='red', label='class 4')

    plt.legend(handles=[c1,c2,c3,c4])

    plt.xlabel('polar angle')
    plt.ylabel('luminous intensity')
    plt.title('Light distribution curves (scaled)')
    




    On the raw plot the curves differ mostly in absolute intensity; after scaling, it is the difference in shape between the classes that stands out.

    Let's also look at the dataset as a whole by projecting it onto a plane with the t-SNE dimensionality reduction method:


    #T-SNE 
    colors = ["#190aff", "#0fff0f",  "#ff641e" , "#ff3232"]
    tsne = TSNE(random_state=42)
    d_tsne = tsne.fit_transform(X_train)
    plt.figure(figsize=(10, 10))
    plt.xlim(d_tsne[:, 0].min(), d_tsne[:, 0].max() + 10)
    plt.ylim(d_tsne[:, 1].min(), d_tsne[:, 1].max() + 10)
    for i in range(len(X_train)):
        # plot every sample as its class label, colored by class
        plt.text(d_tsne[i, 0], d_tsne[i, 1], str(y_train[i]),
        color = colors[y_train[i]-1],
        fontdict={'weight': 'bold', 'size': 10})
    plt.xlabel("t-SNE feature 0")
    plt.ylabel("t-SNE feature 1")
    


    And the same projection for the scaled data (again, subplots would have been neater):
    #T-SNE for scaled data
    d_tsne = tsne.fit_transform(X_train_scl)
    plt.figure(figsize=(10, 10))
    plt.xlim(d_tsne[:, 0].min(), d_tsne[:, 0].max() + 10)
    plt.ylim(d_tsne[:, 1].min(), d_tsne[:, 1].max() + 10)
    for i in range(len(X_train_scl)):
        # plot every sample as its class label, colored by class
        plt.text(d_tsne[i, 0], d_tsne[i, 1], str(y_train[i]),
        color = colors[y_train[i]-1],
        fontdict={'weight': 'bold', 'size': 10})
    plt.xlabel("t-SNE feature 0")
    plt.ylabel("t-SNE feature 1")
    


    [figure: t-SNE projections of the raw and the scaled data]

    The effect of scaling is clearly visible here as well.

    On the scaled data, classes 1, 3 and 4 form reasonably distinct clusters, while class 2 is smeared between classes 1 and 3 (which makes sense: the deep curves are intermediate in shape between the concentrated and the cosine ones).
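    PCA was also on my initial list of tools; purely for comparison (my addition, not part of the original walkthrough), the same scaled data can be projected with a 2-component PCA:

    from sklearn.decomposition import PCA

    d_pca = PCA(n_components=2).fit_transform(X_train_scl)
    plt.figure(figsize=(10, 10))
    plt.scatter(d_pca[:, 0], d_pca[:, 1], c=y_train, cmap='rainbow')
    plt.xlabel('PCA component 0')
    plt.ylabel('PCA component 1')

    Being linear and deterministic, PCA is quicker to run, though it tends to separate these classes less sharply than t-SNE.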

    Part VI: Finally, classification!



    Let's start with SVC, first on the raw and then on the scaled data:

    #predict part of full data (test labels the part of X_train)
    #not scaled
    svm = SVC(kernel='rbf', random_state=42, gamma=2, C=1.1)
    svm.fit(X_train_part, y_train_part)
    pred=svm.predict(X_test_part)
    print("\n not scaled: \n results (pred, real): \n",list(zip(pred,y_test_part)))
    print('not scaled: accuracy = {}, f1-score= {}'.format(accuracy_score(y_test_part,pred), f1_score(y_test_part,pred, average='macro')))
    #scaled
    svm = SVC(kernel='rbf', random_state=42, gamma=2, C=1.1)
    svm.fit(X_train_part_scl, y_train_part)
    pred=svm.predict(X_test_part_scl)
    print("\n scaled: \n results (pred, real): \n",list(zip(pred,y_test_part)))
    print('scaled: accuracy = {}, f1-score= {}'.format(accuracy_score(y_test_part,pred), f1_score(y_test_part,pred, average='macro')))
    




    not scaled:
    results (pred, real):
    [(2, 3), (2, 3), (2, 2), (3, 3), (2, 1), (2, 3), (2, 3), (2, 2), (2, 1), (2, 2), (1, 1), (3, 3), (2, 2), (2, 1), (2, 4), (3, 3), (2, 2), (2, 4), (2, 1), (2, 2), (4, 4), (2, 2), (4, 4), (2, 4), (2, 3), (2, 1), (2, 1), (2, 1), (2, 2)]
    not scaled: accuracy = 0.4827586206896552, f1-score= 0.46380859284085096



    scaled:
    results (pred, real):
    [(3, 3), (3, 3), (2, 2), (3, 3), (1, 1), (3, 3), (3, 3), (2, 2), (1, 1), (2, 2), (1, 1), (3, 3), (2, 2), (1, 1), (4, 4), (3, 3), (2, 2), (4, 4), (1, 1), (2, 2), (4, 4), (2, 2), (4, 4), (4, 4), (3, 3), (1, 1), (1, 1), (1, 1), (2, 2)]
    scaled: accuracy = 1.0, f1-score= 1.0


    As you can see, on the raw data the classifier fails miserably, predicting class 2 for almost everything, while on the scaled data it classifies the held-out part 100% correctly.

    Now the final prediction on the test set, hopefully as close to 100% as possible:

    #final predict full data
    svm.fit(X_train_scl, y_train)
    pred=svm.predict(X_test_scl)
    print("\n results (pred, real): \n",list(zip(pred,y_test)))
    print('scaled: accuracy = {}, f1-score= {}'.format(accuracy_score(y_test,pred), f1_score(y_test,pred, average='macro')))
    




    results (pred, real):
    [(1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4)]
    scaled: accuracy = 1.0, f1-score= 1.0


    Excellent! Both accuracy and f1-score are equal to 1.0: every test curve is classified correctly (though on a dataset this small and this clean, a perfect score is not as impressive as it sounds).
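    For a per-class view one could additionally print a confusion matrix and a classification report (my addition; it uses only the standard sklearn.metrics machinery):

    from sklearn.metrics import confusion_matrix, classification_report

    print(confusion_matrix(y_test, pred))
    print(classification_report(y_test, pred))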

    For comparison, let's also try RandomForest, a model that is often recommended as a strong first baseline and that, in theory, should care much less about scaling.
    rfc=RandomForestClassifier(random_state=42,n_jobs=-1, n_estimators=100)
    
    rfc=rfc.fit(X_train, y_train)
    rpred=rfc.predict(X_test)
    print("\n not scaled: \n results (pred, real): \n",list(zip(rpred,y_test)))
    print('not scaled: accuracy = {}, f1-score= {}'.format( accuracy_score(y_test,rpred), f1_score(y_test,rpred, average='macro')))
    
    
    rfc=rfc.fit(X_train_scl, y_train)
    rpred=rfc.predict(X_test_scl)
    print("\n scaled: \n results (pred, real): \n",list(zip(rpred,y_test)))
    print('scaled: accuracy = {}, f1-score= {}'.format( accuracy_score(y_test,rpred), f1_score(y_test,rpred, average='macro')))
    

    The output:
    not scaled:
    results (pred, real):
    [(1, 1), (1, 1), (2, 1), (1, 1), (1, 1), (2, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (2, 2), (2, 2), (2, 2), (1, 2), (2, 2), (2, 2), (2, 2), (2, 2), (3, 2), (2, 2), (3, 2), (2, 2), (4, 2), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (4, 3), (3, 3), (3, 3), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4)]
    not scaled: accuracy = 0.8541666666666666, f1-score= 0.8547222222222222

    scaled:
    results (pred, real):
    [(1, 1), (1, 1), (2, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4), (4, 4)]
    scaled: accuracy = 0.9791666666666666, f1-score= 0.9807407407407408


    Two things stand out in these results:
    1. Even for the random forest, the row-wise scaling noticeably improved the results (unsurprising once you notice that scaling each curve is not the kind of monotonic per-feature transformation forests are invariant to).
    2. SVC on the scaled data still did better.

    In fairness: the SVC needed its hyperparameters (gamma and C) picked by hand, whereas the random forest worked essentially out of the box with 100 trees. And the 100% result itself should not be overrated: on 193 hand-labeled samples it mostly shows that the problem is easy, not that the model is brilliant.
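    The hand-tuning could be automated; the imports at the top (sp_rand, i.e. scipy.stats.uniform) hint at a randomized search. A sketch of how that might look, my reconstruction rather than code from the post:

    from sklearn.model_selection import RandomizedSearchCV

    param_dist = {'C': sp_rand(0.1, 10), 'gamma': sp_rand(0.1, 5)}
    search = RandomizedSearchCV(SVC(kernel='rbf', random_state=42), param_dist,
                                n_iter=50, cv=5, scoring='f1_macro', random_state=42)
    search.fit(X_train_scl, y_train)
    print(search.best_params_, search.best_score_)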

    If you know how to improve this, or see mistakes, I would be glad to hear about it in the comments.

    Part VII: Conclusions.


    The main takeaway for me: machine learning is applicable not only to textbook datasets but also to small, very domain-specific problems like this one, and with the standard scikit-learn toolbox the whole exercise takes an evening.
    Of course, the problem here is simple and the dataset is tiny, but as a first step into Data Science from one's own subject area it worked out nicely.

    Good luck to everyone, and thanks for reading! ;)

    Original source: habrahabr.ru (comments, light).

    https://habrahabr.ru/post/338124/
