-

   rss_rss_hh_new

 - e-mail

 

 -

 LiveInternet.ru:
: 17.03.2011
:
:
: 51

:


R c H2O Spark HDInsight

, 07 2017 . 09:50 +

imageH2O , , , H2O Spark. H2O Spark, Azure HDInsight, ( : R Spark) H2O (, ) sparklyr, H2O - Spark?


H2O HDInsight Spark


, R Spark, , :
1) sparklyr, , dplyr-
2) R Server for Hadoop, Microsoft,
3) SparkR, ( , Spark 2.2 )


1 .


H2O Spark HDInsight. :


  1. -, H2O , () Spark H2O
  2. , sparklyr, , :
    • Generalized Linear Model
    • Multilayer Perceptron
    • Random Forest
    • Gradient Boosting Machine
    • Naive-Bayes
    • Principal Components Analysis
    • Singular Value Decomposition
    • Generalized Low Rank Model
    • K-Means Clustering
    • Anomaly Detection via Deep Learning Autoencoder
  3. , h2oEnsemble
  4. H2O , , ,
  5. Java "Plain Old Java Object" (POJO)

, H2O :


  1. , sparklyr
  2. , , , sparklyr replyr
  3. H2O , rsparkling
  4. , h2o
  5. Spark / R, rsparkling / sparklyr


  • H2O Artificial Intelligence for HDInsight 2.0.2
    API Python Scala. R ( ) , , :
  • R sparklyr, h2o, rsparkling :
  • RStudio
  • putty , ssh RStudio RStudio -

: h2o , , Spark, rsparkling, rsparkling sparklingwater ( options(rsparkling.sparklingwater.version = '2.0.8'. . , , Action Script.

H2O Artificial Intelligence for HDInsight, , 2 D12v2 4 D12v2 1 Sparkling water (). , ssh , R, RStudio ( RStudio Spark ), . (R ) Action Script. , : . ssh , localhost:8787. , localhost:8787 RStudio .


R , Shiny , - flexdashboard, , , , , web-, ( ).


, , . hdfs, ( ).



- sparklyr, h2o, , . , , ( ), 30%, RMSE. .1 .1.


.1.


RMSE ,
lm_mllib 1,2507 10
lm_h2o 1,2507 5,6
rf_mllib 1,2669 21,9
rf_h2o 1,2531 13,4
gbm_mllib 1,2553 108,3
gbm_h2o 1,2343 24,9

image
.1


, h2o sparklyr, , . h2o gbm, RMSE. , -, , h2o .



, R H2O Spark, HDInsight, sparklyr, sparklyr .


###  ( )    

# Predictor columns passed as `features` / `x` to every model fitted below.
features <- c(
  "vendor_id",
  "passenger_count",
  "trip_time_in_secs",
  "trip_distance",
  "fare_amount",
  "surcharge"
)

#' Root-mean-square error of a residual expression evaluated over a table.
#'
#' @param formula One-sided formula whose right-hand side computes the
#'   residual from columns of `data`, e.g. `~ tip_amount - prediction`.
#' @param data A table accepted by the dplyr verbs used here; this script
#'   passes both Spark prediction tables and a local data frame.
#' @return The value of the `rmse` column of the collected one-row summary.
rmse <- function(formula, data) {
  data %>%
    # Materialise the residual as a column so the aggregate can refer to it.
    mutate_(residual = formula) %>%
    # Fixed: the original called the non-existent `sqr()`; RMSE needs `sqrt()`.
    summarize(rmse = sqrt(mean(residual ^ 2))) %>%
    # Bring the one-row summary into local R memory before extracting.
    collect() %>%
    .[["rmse"]]
}

# Register the training / test partitions as named Spark tables so they can
# be referenced by name (e.g. for caching) and used through dplyr.
trips_train_tbl <- sdf_register(taxi_filtered$training, "trips_train")
trips_test_tbl <- sdf_register(taxi_filtered$test, "trips_test")

# Observed tip amounts of the test partition, collected into a local vector.
# Fixed: the original read from the undefined `trips.test.tbl`; the table
# registered above is `trips_test_tbl`.
actual <- trips_test_tbl %>%
  select(tip_amount) %>%
  collect() %>%
  `[[`("tip_amount")

# Cache both tables by their registered names.
tbl_cache(sc, "trips_train")
tbl_cache(sc, "trips_test")

# Convert the Spark tables to H2O frames and mark `vendor_id` as a factor
# column in each of them.

# Training partition.
trips_train_h2o_tbl <- as_h2o_frame(sc, trips_train_tbl)
trips_train_h2o_tbl$vendor_id <- as.factor(trips_train_h2o_tbl$vendor_id)

# Test partition.
trips_test_h2o_tbl <- as_h2o_frame(sc, trips_test_tbl)
trips_test_h2o_tbl$vendor_id <- as.factor(trips_test_h2o_tbl$vendor_id)

# Spark MLlib models (via sparklyr): fit on the training table, then score
# the test table.

# Linear regression.
lm_mllib <- ml_linear_regression(
  x = trips_train_tbl,
  response = "tip_amount",
  features = features
)
pred_lm_mllib <- sdf_predict(lm_mllib, trips_test_tbl)

# Random forest.
rf_mllib <- ml_random_forest(
  x = trips_train_tbl,
  response = "tip_amount",
  features = features
)
pred_rf_mllib <- sdf_predict(rf_mllib, trips_test_tbl)

# Gradient-boosted trees.
gbm_mllib <- ml_gradient_boosted_trees(
  x = trips_train_tbl,
  response = "tip_amount",
  features = features
)
pred_gbm_mllib <- sdf_predict(gbm_mllib, trips_test_tbl)

# H2O models: fit on the H2O training frame, then score the H2O test frame.

# Generalized linear model.
lm_h2o <- h2o.glm(
  x = features,
  y = "tip_amount",
  training_frame = trips_train_h2o_tbl
)
pred_lm_h2o <- h2o.predict(lm_h2o, trips_test_h2o_tbl)

# Random forest with 20 trees of depth at most 5.
rf_h2o <- h2o.randomForest(
  x = features,
  y = "tip_amount",
  training_frame = trips_train_h2o_tbl,
  ntrees = 20,
  max_depth = 5
)
pred_rf_h2o <- h2o.predict(rf_h2o, trips_test_h2o_tbl)

# Gradient boosting machine.
gbm_h2o <- h2o.gbm(
  x = features,
  y = "tip_amount",
  training_frame = trips_train_h2o_tbl
)
pred_gbm_h2o <- h2o.predict(gbm_h2o, trips_test_h2o_tbl)

#### Local table of observed tips alongside the three H2O predictions ----
# Each H2O prediction frame is pulled into a local data frame and bound
# column-wise next to the observed values.
# Fixed: the original had a trailing comma before `)`, which passes an empty
# argument to data.frame() and raises an error.
pred.h2o <- data.frame(
  tip.amount = actual,
  as.data.frame(pred_lm_h2o),
  as.data.frame(pred_rf_h2o),
  as.data.frame(pred_gbm_h2o)
)
# Rename the columns to short names, one per model.
colnames(pred.h2o) <- c("tip.amount", "lm", "rf", "gbm")

# RMSE of every model on the test partition, one row per model.
# Column naming differs by source: the Spark prediction tables are scored on
# `tip_amount` and `prediction`, while the local `pred.h2o` table uses
# `tip.amount` and the per-model columns `lm`, `rf`, `gbm`.
# Fixed: the original used `tip.amount` for rf.mllib and `tip_amount` for
# rf.h2o, i.e. columns that do not exist in the respective tables.
result <- data.frame(
  RMSE = c(
    lm.mllib = rmse(~ tip_amount - prediction, pred_lm_mllib),
    lm.h2o = rmse(~ tip.amount - lm, pred.h2o),
    rf.mllib = rmse(~ tip_amount - prediction, pred_rf_mllib),
    rf.h2o = rmse(~ tip.amount - rf, pred.h2o),
    gbm.mllib = rmse(~ tip_amount - prediction, pred_gbm_mllib),
    gbm.h2o = rmse(~ tip.amount - gbm, pred.h2o)
  )
)
Original source: habrahabr.ru (comments, light).

https://habrahabr.ru/post/334898/

:  

: [1] []
 

:
: 

: ( )

:

  URL