R c H2O Spark HDInsight |
H2O , , , H2O Spark. H2O Spark, Azure HDInsight, ( : R Spark) H2O (, ) sparklyr, H2O - Spark?
, R Spark, , :
1) sparklyr, , dplyr-
2) R Server for Hadoop, Microsoft,
3) SparkR, ( , Spark 2.2 )
1 .
H2O Spark HDInsight. :
, H2O :
: h2o , , Spark, rsparkling, rsparkling sparklingwater ( options(rsparkling.sparklingwater.version = '2.0.8')). . , , Action Script.
H2O Artificial Intelligence for HDInsight, , 2 D12v2 4 D12v2 1 Sparkling water (). , ssh , R, RStudio ( RStudio Spark ), . (R ) Action Script. , : . ssh , localhost:8787. , localhost:8787 RStudio .
R , Shiny , - flexdashboard, , , , , web-, ( ).
, , . hdfs, ( ).
- sparklyr, h2o, , . , , ( ), 30%, RMSE. .1 .1.
.1.
RMSE | , | |
---|---|---|
lm_mllib | 1,2507 | 10 |
lm_h2o | 1,2507 | 5,6 |
rf_mllib | 1,2669 | 21,9 |
rf_h2o | 1,2531 | 13,4 |
gbm_mllib | 1,2553 | 108,3 |
gbm_h2o | 1,2343 | 24,9 |
.1
, h2o sparklyr, , . h2o gbm, RMSE. , -, , h2o .
, R H2O Spark, HDInsight, sparklyr, sparklyr .
### ( )
# Predictor columns passed as `features` / `x` to every model fitted below.
features <- c(
  "vendor_id", "passenger_count", "trip_time_in_secs",
  "trip_distance", "fare_amount", "surcharge"
)
# Root-mean-square error of a residual expression evaluated against `data`.
#
# Args:
#   formula: one-sided formula whose right-hand side computes the residual
#            from columns of `data`, e.g. `~ tip_amount - prediction`.
#   data:    a table supported by the dplyr verbs used below; in this script
#            it is called with both sdf_predict() results and a local
#            data.frame.
#
# Returns:
#   The `rmse` column of the collected one-row summary.
rmse <- function(formula, data) {
  # RMSE = square root of the mean squared residual. The previous code called
  # `sqr()`, which is not an R function and made every call fail.
  data %>%
    mutate_(residual = formula) %>%
    summarize(rmse = sqrt(mean(residual^2))) %>%
    collect() %>%
    .[["rmse"]]
}
# Register the two partitions of `taxi_filtered` under fixed names so they can
# be cached by name below.
trips_train_tbl <- sdf_register(taxi_filtered$training, "trips_train")
trips_test_tbl <- sdf_register(taxi_filtered$test, "trips_test")

# Observed tips of the test partition, pulled into the local R session.
# Fixed: this used to read the undefined `trips.test.tbl`; the table
# registered above is `trips_test_tbl`.
actual <- trips_test_tbl %>%
  select(tip_amount) %>%
  collect() %>%
  `[[`("tip_amount")

# Cache both registered tables in Spark (`sc` is the connection created
# elsewhere in this file).
tbl_cache(sc, "trips_train")
tbl_cache(sc, "trips_test")

# Convert the Spark tables to H2O frames for the h2o.* models.
trips_train_h2o_tbl <- as_h2o_frame(sc, trips_train_tbl)
trips_test_h2o_tbl <- as_h2o_frame(sc, trips_test_tbl)

# Make vendor_id a factor column in both H2O frames.
trips_train_h2o_tbl$vendor_id <- as.factor(trips_train_h2o_tbl$vendor_id)
trips_test_h2o_tbl$vendor_id <- as.factor(trips_test_h2o_tbl$vendor_id)
# mllib ----
# Fit each model on the training table, then score the test table.

# Linear regression
lm_mllib <- ml_linear_regression(
  x = trips_train_tbl,
  response = "tip_amount",
  features = features
)
pred_lm_mllib <- sdf_predict(lm_mllib, trips_test_tbl)

# Random forest
rf_mllib <- ml_random_forest(
  x = trips_train_tbl,
  response = "tip_amount",
  features = features
)
pred_rf_mllib <- sdf_predict(rf_mllib, trips_test_tbl)

# Gradient boosted trees
gbm_mllib <- ml_gradient_boosted_trees(
  x = trips_train_tbl,
  response = "tip_amount",
  features = features
)
pred_gbm_mllib <- sdf_predict(gbm_mllib, trips_test_tbl)
# h2o ----
# Fit each model on the H2O training frame, then score the H2O test frame.

# Generalized linear model
lm_h2o <- h2o.glm(
  x = features,
  y = "tip_amount",
  training_frame = trips_train_h2o_tbl
)
pred_lm_h2o <- h2o.predict(lm_h2o, trips_test_h2o_tbl)

# Random forest: 20 trees, depth capped at 5
rf_h2o <- h2o.randomForest(
  x = features,
  y = "tip_amount",
  training_frame = trips_train_h2o_tbl,
  ntrees = 20,
  max_depth = 5
)
pred_rf_h2o <- h2o.predict(rf_h2o, trips_test_h2o_tbl)

# Gradient boosting machine
gbm_h2o <- h2o.gbm(
  x = features,
  y = "tip_amount",
  training_frame = trips_train_h2o_tbl
)
pred_gbm_h2o <- h2o.predict(gbm_h2o, trips_test_h2o_tbl)
# Collect H2O predictions ----
# Local data.frame with the observed tips next to the three H2O predictions.
# Fixed: a trailing comma after the last argument left an empty argument in
# data.frame(), which is an error at run time.
# NOTE(review): assumes as.data.frame() on each H2O prediction frame returns
# rows in the same order as `actual` - confirm.
pred.h2o <- data.frame(
  tip.amount = actual,
  as.data.frame(pred_lm_h2o),
  as.data.frame(pred_rf_h2o),
  as.data.frame(pred_gbm_h2o)
)
# Give the prediction columns one short name per model.
colnames(pred.h2o) <- c("tip.amount", "lm", "rf", "gbm")
# RMSE of every model on the test data, one row per model (row names come
# from the names of the vector below).
#
# Column naming differs by source and must match the table being scored:
#   * sdf_predict() results keep the Spark column `tip_amount` and add
#     `prediction`;
#   * `pred.h2o` is the local data.frame with columns
#     `tip.amount`, `lm`, `rf`, `gbm`.
# Fixed: rf.mllib used `tip.amount` and rf.h2o used `tip_amount`, i.e. each
# referenced a column that does not exist in its table.
result <- data.frame(
  RMSE = c(
    lm.mllib = rmse(~ tip_amount - prediction, pred_lm_mllib),
    lm.h2o = rmse(~ tip.amount - lm, pred.h2o),
    rf.mllib = rmse(~ tip_amount - prediction, pred_rf_mllib),
    rf.h2o = rmse(~ tip.amount - rf, pred.h2o),
    gbm.mllib = rmse(~ tip_amount - prediction, pred_gbm_mllib),
    gbm.h2o = rmse(~ tip.amount - gbm, pred.h2o)
  )
)