diff --git a/h2o-dist/buildinfo.json b/h2o-dist/buildinfo.json
index 10aa3d12a37..e54eecf7f1f 100644
--- a/h2o-dist/buildinfo.json
+++ b/h2o-dist/buildinfo.json
@@ -84,11 +84,6 @@
"zip_file_path" : "h2o-SUBST_PROJECT_VERSION-hdp2.6.zip"
},
{
- "distribution" : "mapr3.1",
- "zip_file_name" : "h2o-SUBST_PROJECT_VERSION-mapr3.1.zip",
- "zip_file_path" : "h2o-SUBST_PROJECT_VERSION-mapr3.1.zip"
- },
- {
"distribution" : "mapr4.0",
"zip_file_name" : "h2o-SUBST_PROJECT_VERSION-mapr4.0.zip",
"zip_file_path" : "h2o-SUBST_PROJECT_VERSION-mapr4.0.zip"
diff --git a/h2o-dist/index-old.html b/h2o-dist/index-old.html
index 9da38807ce5..b5c58af1b0e 100644
--- a/h2o-dist/index-old.html
+++ b/h2o-dist/index-old.html
@@ -386,7 +386,6 @@
wget /h2o-SUBST_PROJECT_VERSION-cdh5.4.2.zip
wget /h2o-SUBST_PROJECT_VERSION-hdp2.1.zip
wget /h2o-SUBST_PROJECT_VERSION-hdp2.2.zip
- wget /h2o-SUBST_PROJECT_VERSION-mapr3.1.1.zip
wget /h2o-SUBST_PROJECT_VERSION-mapr4.0.1.zip
2. Unpack the zip file and launch a 6g instance of H2O:
diff --git a/h2o-docs/src/booklets/v2_2015/source/RBooklet.tex b/h2o-docs/src/booklets/v2_2015/source/RBooklet.tex
index a3fc3a9287c..fa1b6c5791c 100644
--- a/h2o-docs/src/booklets/v2_2015/source/RBooklet.tex
+++ b/h2o-docs/src/booklets/v2_2015/source/RBooklet.tex
@@ -15,7 +15,7 @@
\textsc{\Large\bf{Machine Learning with R and H2O}}
\\
\bigskip
-\line(1,0){250} %inserts horizontal line
+\line(1,0){250} %inserts horizontal line
\textsc{\small {Spencer Aiello \hspace{10pt} Eric Eckstrand \hspace{10pt} Anqi Fu}}
@@ -47,7 +47,7 @@
\thispagestyle{empty} %removes page number
-{\raggedright\vfill\
+{\raggedright\vfill\
Machine Learning with R and H2O\\
@@ -59,13 +59,13 @@
2307 Leghorn St. \\
Mountain View, CA 94043\\
\bigskip
-\textcopyright \the\year \hspace{1pt} H2O.ai, Inc. All Rights Reserved.
+\textcopyright \the\year \hspace{1pt} H2O.ai, Inc. All Rights Reserved.
\bigskip
\monthname \hspace{1pt} \the\year: Sixth Edition
\bigskip
-Photos by \textcopyright H2O.ai, Inc.
+Photos by \textcopyright H2O.ai, Inc.
\bigskip
All copyrights belong to their respective owners.\\
@@ -75,7 +75,7 @@
omissions, or for damages resulting from the\\
use of the information contained herein.\\
\bigskip
-Printed in the United States of America.
+Printed in the United States of America.
@@ -91,23 +91,23 @@
\section{Introduction}
-This documentation describes how to use H2O in the R environment. More information on H2O's system and algorithms (as well as R user documentation) is available at the H2O website at {\url{http://docs.h2o.ai}}.
+This documentation describes how to use H2O in the R environment. More information on H2O's system and algorithms (as well as R user documentation) is available at the H2O website at {\url{http://docs.h2o.ai}}.
R uses a REST API to connect to H2O. To use H2O in R or launch H2O from R, specify the IP address and port number of the H2O instance in the R environment . Datasets are not directly transmitted through the REST API. Instead, commands (for example, importing a dataset at specified HDFS location) are sent either through the browser or the REST API to perform the specified task.
-The dataset is then assigned an identifier (the .hex file type in H2O) used as a reference in commands to the web server. After preparing the dataset for modeling by defining significant data and removing insignificant data, H2O creates a model that represents the results of the data analysis. These models are assigned IDs used as references in commands. One of the most popular models for data analysis is GLM.
+The dataset is then assigned an identifier (the .hex file type in H2O) used as a reference in commands to the web server. After preparing the dataset for modeling by defining significant data and removing insignificant data, H2O creates a model that represents the results of the data analysis. These models are assigned IDs used as references in commands. One of the most popular models for data analysis is GLM.
GLM estimates regression models for outcomes following exponential distributions in general. In addition to the Gaussian (i.e. normal) distribution, these include binomial, gamma, Poisson, and Tweedie distributions. Each serves a different purpose, and depending on distribution and link function, can be used for prediction or classification.
-This booklet demonstrates H2O's implementation of GLM in an R environment. For more information on GLM, refer to \textbf{Generalized Linear Modeling with H2O} at {\url{http://h2o.ai/resources/}}.
+This booklet demonstrates H2O's implementation of GLM in an R environment. For more information on GLM, refer to \textbf{Generalized Linear Modeling with H2O} at {\url{http://h2o.ai/resources/}}.
-H2O supports Spark, YARN, and most versions of Hadoop. Hadoop is a scalable open-source file system that uses clusters for distributed storage and dataset processing. Depending on the size of your data, H2O can run on your desktop or scale using multiple nodes with Hadoop, an EC2 cluster, or S3 storage.
+H2O supports Spark, YARN, and most versions of Hadoop. Hadoop is a scalable open-source file system that uses clusters for distributed storage and dataset processing. Depending on the size of your data, H2O can run on your desktop or scale using multiple nodes with Hadoop, an EC2 cluster, or S3 storage.
H2O nodes run as JVM invocations on Hadoop nodes. For performance reasons, we recommend that you do not run an H2O node on the same hardware as the Hadoop NameNode. Because H2O nodes run as mapper tasks in Hadoop, administrators can view them in the normal JobTracker and TaskTracker frameworks, providing process-level (i.e. JVM instance-level) visibility.
-H2O helps R users make the leap from laptop-based processing to large-scale environments. Hadoop lets H2O users scale their data processing capabilities based on their current needs. Using H2O, R, and Hadoop, you can create a complete end-to-end data analysis solution.
+H2O helps R users make the leap from laptop-based processing to large-scale environments. Hadoop lets H2O users scale their data processing capabilities based on their current needs. Using H2O, R, and Hadoop, you can create a complete end-to-end data analysis solution.
-This document describes the four steps of data analysis with H2O:
+This document describes the four steps of data analysis with H2O:
\begin{enumerate}
\item installing H2O
@@ -125,10 +125,10 @@ \section{Installation}
To use H2O with R, start H2O outside of R and connect to it, or launch H2O from R. However, if you launch H2O from R and close the R session, the H2O session closes as well. The H2O session directs R to the datasets and models located in H2O.
-This following sections describe:
+This following sections describe:
\begin{itemize}
-\item installing R
+\item installing R
\item installing H2O from R
%\item making a build from source code
\end{itemize}
@@ -137,10 +137,10 @@ \subsection{Installing R}
To download R:
\begin{enumerate}
-\item Go to \url{http://cran.r-project.org/mirrors.html}.
-\item Select your closest local mirror.
-\item Select your operating system (Linux, OS X, or Windows).
-\item Depending on your OS, download the appropriate file, along with any required packages.
+\item Go to \url{http://cran.r-project.org/mirrors.html}.
+\item Select your closest local mirror.
+\item Select your operating system (Linux, OS X, or Windows).
+\item Depending on your OS, download the appropriate file, along with any required packages.
\item When the download is complete, unzip the file and install. \\
\end{enumerate}
@@ -179,19 +179,19 @@ \subsection{Example Code}
\subsection{Citation}
-To cite this booklet, use the following:
+To cite this booklet, use the following:
Aiello, S., Eckstrand, E., Fu, A., Landry, M., and Aboyoun, P. (\shortmonthname\ \the\year). {\textit{Machine Learning with R and H2O}. {\url{http://h2o.ai/resources/}}.
-%Tom: Do we still want to include this section? If so, can you please make an R example that will call the latest version?
+%Tom: Do we still want to include this section? If so, can you please make an R example that will call the latest version?
-%\subsection{Making a build from the Source Code}
+%\subsection{Making a build from the Source Code}
%If you are a developer who wants to make changes to the R package before building and installing it, pull the source code from Git ({\url{https://github.com/h2oai/h2o-3}}) and follow the instructions in at {\url{https://github.com/h2oai/h2o-3/blob/master/README.md}}.
%TODO
%After making the build, navigate to the top-level \texttt{h2o-3} directory using {\texttt{cd $\mathtt{\sim}$/h2o-3}}, then run the following (replacing the asterisks [*] with the version number) and install.
%\begin{lstlisting}[style=R]
-%./gradlew clean
+%./gradlew clean
%./gradlew build
%$ R CMD INSTALL h2o-r/R/src/contrib/h2o_****.tar.gz
%* installing to library ÔøΩ/Users/H2OUser/.RlibraryÔøΩ
@@ -210,7 +210,7 @@ \subsection{Citation}
\section{H2O Initialization}
-This section describes how to launch H2O:
+This section describes how to launch H2O:
\begin{itemize}
\item from R
\item from the command line
@@ -219,16 +219,16 @@ \section{H2O Initialization}
\subsection{Launching from R} \label{ssec:LaunchR}
-To specify the number of CPUs for the H2O session, use the \texttt{nthreads = } parameter in the \texttt{h2o.init} command. \texttt{-2} uses the CRAN default of 2 CPUs. \texttt{-1} uses
+To specify the number of CPUs for the H2O session, use the \texttt{nthreads = } parameter in the \texttt{h2o.init} command. \texttt{-2} uses the CRAN default of 2 CPUs. \texttt{-1} uses
all CPUs on the host, which is strongly recommended. To use a specific number of CPUs, enter a positive integer.
-To specify the maximum amount of memory for the H2O session, use the {\texttt{max\_mem\_size}} parameter in the \texttt{h2o.init} command. The value must a multiple of 1024 greater than 2MB. Append the letter \texttt{m} or \texttt{M} to indicate megabytes, or \texttt{g} or \texttt{G} to indicate gigabytes.
+To specify the maximum amount of memory for the H2O session, use the {\texttt{max\_mem\_size}} parameter in the \texttt{h2o.init} command. The value must a multiple of 1024 greater than 2MB. Append the letter \texttt{m} or \texttt{M} to indicate megabytes, or \texttt{g} or \texttt{G} to indicate gigabytes.
-If you do not specify a value for {\texttt{max\_mem\_size}} when you run {\texttt{h2o.init}}, the default heap size of the H2O instance running on 32-bit Java is 1g.
+If you do not specify a value for {\texttt{max\_mem\_size}} when you run {\texttt{h2o.init}}, the default heap size of the H2O instance running on 32-bit Java is 1g.
For best performance, the allocated memory should be 4x the size of your data, but never more than the total amount of memory on your computer. For larger datasets, we recommend running on a server or service with more memory available for computing.
-H2O checks the Java version and suggests an upgrade if you are running 32-bit Java. On 64-bit Java, the heap size is 1/4 of the total memory available on the machine.
+H2O checks the Java version and suggests an upgrade if you are running 32-bit Java. On 64-bit Java, the heap size is 1/4 of the total memory available on the machine.
\begin{minipage}{\textwidth}
@@ -238,12 +238,12 @@ \subsection{Launching from R} \label{ssec:LaunchR}
\waterExampleInR
\lstinputlisting[style=R]{R_Vignette_code_examples/r_start_local.R}
-After successfully launching, R displays output similar to the following example:
+After successfully launching, R displays output similar to the following example:
\begin{lstlisting}[style=R]
Successfully connected to http://localhost:54321
R is connected to H2O cluster:
H2O cluster uptime: 11 minutes 35 seconds
- H2O cluster version: 2.7.0.1497
+ H2O cluster version: 2.7.0.1497
H2O cluster name: H2O_started_from_R
H2O cluster total nodes: 1
H2O cluster total memory: 3.56 GB
@@ -309,7 +309,6 @@ \subsection{Launching on Hadoop}
\end{itemize} &
\begin{itemize}
- \item \texttt{mapr3.1.1}
\item \texttt{mapr4.0.1}
\item \texttt{mapr5.0}
\end{itemize}\\
@@ -335,7 +334,7 @@ \subsection{Checking Cluster Status}
\lstinputlisting[style=R]{R_Vignette_code_examples/r_cluster_info.R}
-An easy-to-read summary of information about the cluster displays.
+An easy-to-read summary of information about the cluster displays.
\begin{lstlisting}[style=R]
@@ -353,16 +352,16 @@ \subsection{Checking Cluster Status}
\section{Data Preparation in R}
-The following section contains information about data preparation (also known as data munging) and some of the tools and methods available in H2O, as well as a data training example.
+The following section contains information about data preparation (also known as data munging) and some of the tools and methods available in H2O, as well as a data training example.
\subsection{Notes}
\begin{itemize}
-\item Although it may seem like you are manipulating the data in R, once the data has been passed to H2O, all data munging occurs in the H2O instance. The information is passed to R through JSON APIs, so some functions may not have another method.
-\item You are limited by the total amount of memory allocated to the H2O instance, not by R's ability to handle data. To process large datasets, make sure to allocate enough memory. For more information, refer to {\textbf{\nameref{ssec:LaunchR}}}.
-\item You can manipulate datasets with thousands of factor levels using H2O in R, so if you ask H2O to display a table in R with information from high cardinality factors, the results may overwhelm R`s capacity.
+\item Although it may seem like you are manipulating the data in R, once the data has been passed to H2O, all data munging occurs in the H2O instance. The information is passed to R through JSON APIs, so some functions may not have another method.
+\item You are limited by the total amount of memory allocated to the H2O instance, not by R's ability to handle data. To process large datasets, make sure to allocate enough memory. For more information, refer to {\textbf{\nameref{ssec:LaunchR}}}.
+\item You can manipulate datasets with thousands of factor levels using H2O in R, so if you ask H2O to display a table in R with information from high cardinality factors, the results may overwhelm R`s capacity.
\item To manipulate data in R and not in H2O, use {\texttt{as.data.frame()}}, {\texttt{as.h2o()}}, and {\texttt{str()}}. \begin{itemize}
-\item {\texttt{as.data.frame()}} converts an H2O data frame into an R data frame. If your request exceeds the amount of data supported by R, the R session will crash. If possible, we recommend only taking subsets of the entire dataset (the necessary data columns or rows) instead of the whole dataset.
+\item {\texttt{as.data.frame()}} converts an H2O data frame into an R data frame. If your request exceeds the amount of data supported by R, the R session will crash. If possible, we recommend only taking subsets of the entire dataset (the necessary data columns or rows) instead of the whole dataset.
\item {\texttt{as.h2o()}} transfers data from R to the H2O instance. For successful data transfer, we recommend confirming enough memory is allocated to the H2O instance.
\item {\texttt{str.H2OFrame()}} returns the elements of the new object to confirm that the data transferred correctly. It$'$s a good way to verify there were no data loss or conversion issues. %% is this still supported? can't find it in the R package doc...
%%TODO - replaced with possible new command; confirm (JL)
@@ -373,15 +372,15 @@ \subsection{Notes}
%\subsection{Tools and Methods}
-%The following section describes some of the tools and methods available in H2O for data preparation.
+%The following section describes some of the tools and methods available in H2O for data preparation.
%\begin{itemize}
-%\item {\textbf{Data Profiling}}: Quickly summarize the shape of your dataset to avoid bias or missing information before you start building your model. Missing data, zero values, text, and a visual distribution of the data are visualized automatically upon data ingestion.
-%\item {\textbf{Summary Statistics}}: Visualize your data with summary statistics to get the mean, standard deviation, min, max, or quantile (for numeric columns) or cardinality and counts (for enum columns), and a preview of the dataset.
-%\item {\textbf{Aggregate, Filter, Bin, and Derive Columns}}: Build unique views with Group functions, Filtering, Binning, and Derived Columns.
-%\item {\textbf{Slice, Log Transform, and Anonymize}}: Normalize and partition to get your data into the right shape for modeling, and anonymize to remove confidential information.
-%\item {\textbf{Variable Creation}}: Highly customizable variable value creation to hone in on the key data characteristics to model.
-%\item {\textbf{PCA}}: Principal Component Analysis makes feature selection easy with a simple interface and standard input values to reduce the many dimensions in your dataset into key components.
-%\item {\textbf{Training and Validation Sampling Plan}}: Design a random or stratified sampling plan to generate datasets for model training and scoring.
+%\item {\textbf{Data Profiling}}: Quickly summarize the shape of your dataset to avoid bias or missing information before you start building your model. Missing data, zero values, text, and a visual distribution of the data are visualized automatically upon data ingestion.
+%\item {\textbf{Summary Statistics}}: Visualize your data with summary statistics to get the mean, standard deviation, min, max, or quantile (for numeric columns) or cardinality and counts (for enum columns), and a preview of the dataset.
+%\item {\textbf{Aggregate, Filter, Bin, and Derive Columns}}: Build unique views with Group functions, Filtering, Binning, and Derived Columns.
+%\item {\textbf{Slice, Log Transform, and Anonymize}}: Normalize and partition to get your data into the right shape for modeling, and anonymize to remove confidential information.
+%\item {\textbf{Variable Creation}}: Highly customizable variable value creation to hone in on the key data characteristics to model.
+%\item {\textbf{PCA}}: Principal Component Analysis makes feature selection easy with a simple interface and standard input values to reduce the many dimensions in your dataset into key components.
+%\item {\textbf{Training and Validation Sampling Plan}}: Design a random or stratified sampling plan to generate datasets for model training and scoring.
%\end{itemize}
%\subsection{Demo: Creating Aggregates from Split Data}
@@ -396,12 +395,12 @@ \subsection{Notes}
\section{Models}
%TODO: Another candidate for "common"
-The following section describes the features and functions of some common models available in H2O. For more information about running these models in R using H2O, refer to {\textbf{\nameref{sec:RunModel}}}.
+The following section describes the features and functions of some common models available in H2O. For more information about running these models in R using H2O, refer to {\textbf{\nameref{sec:RunModel}}}.
%%mal H2O supports the following models: Deep Learning (DL), Generalized Linear Models (GLM), Gradient Boosting Machine (GBM), Na\"{i}ve Bayes (NB), Random Forest (RF), K-Means, Principal Components Analysis (PCA) and Generalized Low Rank Model (GLRM).
\begin{minipage}{\textwidth}
-H2O supports the following models:
+H2O supports the following models:
\begin{frame}%no line table for list of 6+ items
@@ -416,7 +415,7 @@ \section{Models}
\end{itemize} &
\begin{itemize}
- \item Generalized Linear Models (GLM)
+ \item Generalized Linear Models (GLM)
\item Gradient Boosted Regression (GBM)
% \item Generalized Low Rank Model (GLRM) Supported?
\item Distributed Random Forest (DRF)
@@ -427,7 +426,7 @@ \section{Models}
\end{frame}
\end{minipage}
-The list is growing quickly, so check \url{www.h2o.ai} to see the latest additions. The following list describes some common model types and features.
+The list is growing quickly, so check \url{www.h2o.ai} to see the latest additions. The following list describes some common model types and features.
\subsection{Supervised Learning}
@@ -450,7 +449,7 @@ \subsection{Unsupervised Learning}
\subsection{Modeling Constructs}
-{\textbf{Grid Search}}: Performs standard hyper-parameter optimization to simplify model configuration.
+{\textbf{Grid Search}}: Performs standard hyper-parameter optimization to simplify model configuration.
After creating a model, use it to make predictions. For more information about predictions, refer to {\textbf{\nameref{ssec:Predictions}}}.
@@ -479,7 +478,7 @@ \section{Data Manipulation in R}
The following section describes some common R commands. For a complete command list, including parameters, refer to {\url{http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Rdoc.html}}.
-For additional help within R's Help tab, precede the command with a question mark (for example, {\texttt{?h2o}}) for suggested commands containing the search terms. For more information on a command, precede the command with two question marks ({\texttt{??h2o}}).
+For additional help within R's Help tab, precede the command with a question mark (for example, {\texttt{?h2o}}) for suggested commands containing the search terms. For more information on a command, precede the command with two question marks ({\texttt{??h2o}}).
\subsection{Importing Files}
@@ -548,9 +547,9 @@ \subsection{Viewing Column Names}
\begin{lstlisting}[style=R]
##Displays the titles of the columns
> colnames(iris.hex)
-[1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
+[1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
> names(iris.hex)
-[1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
+[1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
\end{lstlisting}
\subsection{Getting Minimum and Maximum Values}
@@ -596,19 +595,19 @@ \subsection{Summarizing Data}
\small
\begin{lstlisting}[style=R]
> summary(prostate.hex)
- ID CAPSULE AGE RACE DPROS
- Min. : 1.00 Min. :0.0000 Min. :43.00 Min. :0.000 Min. :1.000
- 1st Qu.: 95.75 1st Qu.:0.0000 1st Qu.:62.00 1st Qu.:1.000 1st Qu.:1.000
- Median :190.50 Median :0.0000 Median :67.00 Median :1.000 Median :2.000
- Mean :190.50 Mean :0.4026 Mean :66.04 Mean :1.087 Mean :2.271
- 3rd Qu.:285.25 3rd Qu.:1.0000 3rd Qu.:71.00 3rd Qu.:1.000 3rd Qu.:3.000
- Max. :380.00 Max. :1.0000 Max. :79.00 Max. :2.000 Max. :4.000
- DCAPS PSA VOL GLEASON
- Min. :1.000 Min. : 0.300 Min. : 0.00 Min. :0.000
- 1st Qu.:1.000 1st Qu.: 4.900 1st Qu.: 0.00 1st Qu.:6.000
- Median :1.000 Median : 8.664 Median :14.20 Median :6.000
- Mean :1.108 Mean : 15.409 Mean :15.81 Mean :6.384
- 3rd Qu.:1.000 3rd Qu.: 17.063 3rd Qu.:26.40 3rd Qu.:7.000
+ ID CAPSULE AGE RACE DPROS
+ Min. : 1.00 Min. :0.0000 Min. :43.00 Min. :0.000 Min. :1.000
+ 1st Qu.: 95.75 1st Qu.:0.0000 1st Qu.:62.00 1st Qu.:1.000 1st Qu.:1.000
+ Median :190.50 Median :0.0000 Median :67.00 Median :1.000 Median :2.000
+ Mean :190.50 Mean :0.4026 Mean :66.04 Mean :1.087 Mean :2.271
+ 3rd Qu.:285.25 3rd Qu.:1.0000 3rd Qu.:71.00 3rd Qu.:1.000 3rd Qu.:3.000
+ Max. :380.00 Max. :1.0000 Max. :79.00 Max. :2.000 Max. :4.000
+ DCAPS PSA VOL GLEASON
+ Min. :1.000 Min. : 0.300 Min. : 0.00 Min. :0.000
+ 1st Qu.:1.000 1st Qu.: 4.900 1st Qu.: 0.00 1st Qu.:6.000
+ Median :1.000 Median : 8.664 Median :14.20 Median :6.000
+ Mean :1.108 Mean : 15.409 Mean :15.81 Mean :6.384
+ 3rd Qu.:1.000 3rd Qu.: 17.063 3rd Qu.:26.40 3rd Qu.:7.000
Max. :2.000 Max. :139.700 Max. :97.60 Max. :9.000
\end{lstlisting}
\normalsize
@@ -667,13 +666,13 @@ \subsection{Generating Random Numbers}
## Creates object for uniform distribution on prostate data set
> s <- h2o.runif(prostate.hex)
> summary (s) ## Summarize the results of h2o.runif
- rnd
- Min. :0.000863
- 1st Qu.:0.239763
- Median :0.507936
- Mean :0.506718
- 3rd Qu.:0.765194
- Max. :0.993178
+ rnd
+ Min. :0.000863
+ 1st Qu.:0.239763
+ Median :0.507936
+ Mean :0.506718
+ 3rd Qu.:0.765194
+ Max. :0.993178
## Create training set with threshold of 0.8
> prostate.train <- prostate.hex[s <= 0.8,]
##Assign name to training set
@@ -688,7 +687,7 @@ \subsection{Generating Random Numbers}
> nrow(prostate.hex) ## Matches the full set
[1] 380
\end{lstlisting}
-
+
\subsection{Splitting Frames}
@@ -716,7 +715,7 @@ \subsection{Getting Frames}
\subsection{Getting Models}
-To create a reference object for the model in H2O, use {\texttt{h2o.getModel()}}. This is helpful for users that alternate between the web UI and the R API or multiple users accessing the same H2O instance.
+To create a reference object for the model in H2O, use {\texttt{h2o.getModel()}}. This is helpful for users that alternate between the web UI and the R API or multiple users accessing the same H2O instance.
In the following example, it is assumed that a GBM with the ID \texttt{GBM\_8e4591a9b413407b983d73fbd9eb44cf} is in the key-value (KV) store.
@@ -739,7 +738,7 @@ \subsection{Listing H2O Objects}
2 GBM_a3ae2edf5dfadbd9ba5dc2e9560c405d 1516
\end{lstlisting}
-\normalsize
+\normalsize
\subsection{Removing H2O Objects}
@@ -789,11 +788,11 @@ \section{Running Models}
\begin{itemize}
\item Gradient Boosting Machine (GBM)
\item Generalized Linear Models (GLM)
-\item K-means
+\item K-means
\item Principal Components Analysis (PCA)
\end{itemize}
-as well as how to generate predictions.
+as well as how to generate predictions.
\subsection{Gradient Boosting Machine (GBM)}
@@ -838,13 +837,13 @@ \subsection{Gradient Boosting Machine (GBM)}
\begin{lstlisting}[style=R]
> iris.gbm2 <- h2o.gbm(y = 5, x = 1:4, training_frame = iris.hex, ntrees = 15, max_depth = 5, min_rows = 2, learn_rate = 0.01, distribution= "multinomial")
-
+
> iris.gbm2@model$training_metrics
H2OMultinomialMetrics: gbm
** Reported on training data. **
-Training Set Metrics:
+Training Set Metrics:
=====================
Extract training frame with `h2o.getFrame("iris.hex")`
@@ -931,7 +930,7 @@ \subsection{K-means}
Model Details:
==============
H2OClusteringModel: kmeans
-Model ID: K-means_model_R_1441989204383_30
+Model ID: K-means_model_R_1441989204383_30
Model Summary:
number_of_rows number_of_clusters number_of_categorical_columns number_of_iterations within_cluster_sum_of_squares
1 150 3 0 8 139.09920
@@ -941,7 +940,7 @@ \subsection{K-means}
** Reported on training data. **
Total Within SS: 139.0992
Between SS: 456.9008
-Total SS: 596
+Total SS: 596
Centroid Statistics:
centroid size within_cluster_sum_of_squares
1 1 44.00000 43.34674
@@ -968,7 +967,7 @@ \subsection{Principal Components Analysis (PCA)}
Model Details:
==============
H2ODimReductionModel: pca
-Model Key: PCA_model_R_1441989204383_36
+Model Key: PCA_model_R_1441989204383_36
Importance of components:
pc1 pc2 pc3
Standard deviation 1.750703 1.512142 1.031181
@@ -982,7 +981,7 @@ \subsection{Principal Components Analysis (PCA)}
\subsection{Predictions}
\label{ssec:Predictions}
-The following section describes some of the prediction methods available in H2O.
+The following section describes some of the prediction methods available in H2O.
{\textbf{Predict}}: Generate outcomes of a dataset with any model. Predict with GLM, GBM, Decision Trees or Deep Learning models.
@@ -994,7 +993,7 @@ \subsection{Predictions}
{\textbf{PCA Score}}: Determine how well your feature selection fits a particular model.
-{\textbf{Multi-Model Scoring}}: Compare and contrast multiple models on a dataset to find the best performer to deploy into production.
+{\textbf{Multi-Model Scoring}}: Compare and contrast multiple models on a dataset to find the best performer to deploy into production.
To apply an H2O model to a holdout set for predictions based on model results, use {\texttt{h2o.predict()}}.
@@ -1024,9 +1023,9 @@ \subsection{Predictions}
10 1 0.46709293 0.5329071
\end{lstlisting}
-\section{Appendix: Commands} \label{Appendix}
+\section{Appendix: Commands} \label{Appendix}
-The following section lists some common R commands by function and a brief description of each command.
+The following section lists some common R commands by function and a brief description of each command.
\subsection {Dataset Operations}
{\bgroup\obeylines %needed for formatting
@@ -1034,143 +1033,143 @@ \section{Appendix: Commands} \label{Appendix}
{\emph{Data Import/Export}}
{\texttt{h2o.downloadCSV}}: Download a H2O dataset to a CSV file on local disk.
{\texttt{h2o.exportFile}}: Export H2O Data Frame to a file.
-{\texttt{h2o.importFile}}: Import a file from the local path and parse it.
-{\texttt{h2o.parseRaw}}: Parse a raw data file.
-{\texttt{h2o.uploadFile}}: Upload a file from the local drive and parse it.
+{\texttt{h2o.importFile}}: Import a file from the local path and parse it.
+{\texttt{h2o.parseRaw}}: Parse a raw data file.
+{\texttt{h2o.uploadFile}}: Upload a file from the local drive and parse it.
\medskip
{\emph{Native R to H2O Coercion}}\smallskip
- {\texttt{as.h2o}}: Convert an R object to an H2O object.
+ {\texttt{as.h2o}}: Convert an R object to an H2O object.
\medskip
\emph{H2O to Native R Coercion}\par
-{\texttt{as.data.frame}}: Check if an object is a data frame, or coerce it if possible.
+{\texttt{as.data.frame}}: Check if an object is a data frame, or coerce it if possible.
\medskip
\emph{Data Generation}\par
-{\texttt{h2o.createFrame}}: Create an H2O data frame, with optional randomization.
-{\texttt{h2o.runif}}: Produce a vector of random uniform numbers.
-{\texttt{h2o.interaction}}: Create interaction terms between categorical features of an H2O Frame.
+{\texttt{h2o.createFrame}}: Create an H2O data frame, with optional randomization.
+{\texttt{h2o.runif}}: Produce a vector of random uniform numbers.
+{\texttt{h2o.interaction}}: Create interaction terms between categorical features of an H2O Frame.
\medskip
\emph{Data Sampling/Splitting}\par
-{\texttt{h2o.splitFrame}}: Split an existing H2O dataset according to user-specified ratios.
+{\texttt{h2o.splitFrame}}: Split an existing H2O dataset according to user-specified ratios.
\emph{Missing Data Handling}\par
-{\texttt{h2o.impute}}: Impute a column of data using the mean, median, or mode.
-{\texttt{h2o.insertMissingValues}}: Replaces a user-specified fraction of entries in a H2O dataset with missing values.
+{\texttt{h2o.impute}}: Impute a column of data using the mean, median, or mode.
+{\texttt{h2o.insertMissingValues}}: Replaces a user-specified fraction of entries in a H2O dataset with missing values.
}
\subsection{General Data Operations}
{\bgroup\obeylines %needed for formatting
-\emph{Subscripting example to pull pieces from data object.}
+\emph{Subscripting example to pull pieces from data object.}
\begin{lstlisting}[style=R]
x[j] ## note: chooses column J, not row J
x[i, j]
x[[i]]
- x$name
+ x$name
x[i] <- value
x[i, j, ...] <- value
x[[i]] <- value
x$i <- value
\end{lstlisting}
-\emph{Subsetting}
+\emph{Subsetting}
\medskip
-{\texttt{head, tail}}: Return the First or Last Part of an Object
+{\texttt{head, tail}}: Return the First or Last Part of an Object
\medskip
\emph{Concatenation}\par
{\texttt{c}}: Combine Values into a Vector or List \\ %%still supported? couldn't find in R doc
{\texttt{h2o.cbind}}: Take a sequence of H2O datasets and combine them by column.\par
\medskip
{\emph{Data Attributes}}\par
-{\texttt{colnames}}: Return column names for a parsed H2O data object.
-{\texttt{colnames$<$-}}: Retrieve or set the row or column names of a matrix-like object.
-{\texttt{names}}: Get the name of an object.
-{\texttt{names$<$-}}: Set the name of an object.
-{\texttt{dim}}: Retrieve the dimension of an object.
-{\texttt{length}}: Get the length of vectors (including lists) and factors.
-{\texttt{nrow}}: Return a count of the number of rows in an H2OParsedData object.
-{\texttt{ncol}}: Return a count of the number of columns in an H2OParsedData object.
-{\texttt{h2o.anyFactor}}: Check if an H2O parsed data object has any categorical data columns.
-{\texttt{is.factor}}: Check if a given column contains categorical data.
+{\texttt{colnames}}: Return column names for a parsed H2O data object.
+{\texttt{colnames$<$-}}: Retrieve or set the row or column names of a matrix-like object.
+{\texttt{names}}: Get the name of an object.
+{\texttt{names$<$-}}: Set the name of an object.
+{\texttt{dim}}: Retrieve the dimension of an object.
+{\texttt{length}}: Get the length of vectors (including lists) and factors.
+{\texttt{nrow}}: Return a count of the number of rows in an H2OParsedData object.
+{\texttt{ncol}}: Return a count of the number of columns in an H2OParsedData object.
+{\texttt{h2o.anyFactor}}: Check if an H2O parsed data object has any categorical data columns.
+{\texttt{is.factor}}: Check if a given column contains categorical data.
\medskip
{\emph{Data Type Coercion}}\par
-{\texttt{as.factor}}: Convert a column from numeric to factor.
-{\texttt{as.Date}}: Converts a column from factor to date.
+{\texttt{as.factor}}: Convert a column from numeric to factor.
+{\texttt{as.Date}}: Converts a column from factor to date.
}
\subsection{Methods from Group Generics}
{\bgroup\obeylines %needed for formatting
\medskip
{\emph{Math (H2O)}}\par
-{\texttt{abs}}: Compute the absolute value of x.
-{\texttt{sign}}: Return a vector with the signs of the corresponding elements of x (the sign of a real number is 1, 0, or -1 if the number is positive, zero, or negative, respectively).
-{\texttt{sqrt}}: Computes the principal square root of x, $\sqrt{x}$.
-{\texttt{ceiling}}: Take a single numeric argument x and return a numeric vector containing the smallest integers not less than the corresponding elements of x.
-{\texttt{floor}}: Take a single numeric argument x and return a numeric vector containing the largest integers not greater than the corresponding elements of x.
-{\texttt{trunc}}: Take a single numeric argument x and return a numeric vector containing the integers formed by truncating the values in x toward 0.
-{\texttt{log}}: Compute logarithms (by default, natural logarithms).
-{\texttt{exp}}: Compute the exponential function.
+{\texttt{abs}}: Compute the absolute value of x.
+{\texttt{sign}}: Return a vector with the signs of the corresponding elements of x (the sign of a real number is 1, 0, or -1 if the number is positive, zero, or negative, respectively).
+{\texttt{sqrt}}: Computes the principal square root of x, $\sqrt{x}$.
+{\texttt{ceiling}}: Take a single numeric argument x and return a numeric vector containing the smallest integers not less than the corresponding elements of x.
+{\texttt{floor}}: Take a single numeric argument x and return a numeric vector containing the largest integers not greater than the corresponding elements of x.
+{\texttt{trunc}}: Take a single numeric argument x and return a numeric vector containing the integers formed by truncating the values in x toward 0.
+{\texttt{log}}: Compute logarithms (by default, natural logarithms).
+{\texttt{exp}}: Compute the exponential function.
\medskip
{\emph{Math (generic)}}\par
-{\texttt{cummax}}: Display a vector of the cumulative maxima of the elements of the argument.
-{\texttt{cummin}}: Display a vector of the cumulative minima of the elements of the argument.
-{\texttt{cumprod}}: Display a vector of the cumulative products of the elements of the argument.
-{\texttt{cumsum}}: Display a vector of the cumulative sums of the elements of the argument.
-{\texttt{log10}}: Compute common (i.e., base 10) logarithms
-{\texttt{log2}}: Compute binary (i.e., base 2) logarithms.
-{\texttt{log1p}}: Compute log(1+x) accurately also for $|x|${}\textless{}\textless{} 1.
-{\texttt{acos}}: Compute the trigonometric arc-cosine.
-{\texttt{acosh}}: Compute the hyperbolic arc-cosine.
-{\texttt{asin}}: Compute the trigonometric arc-sine.
-{\texttt{asinh}}: Compute the hyperbolic arc-sine.
-{\texttt{atan}}: Compute the trigonometric arc-tangent.
-{\texttt{atanh}}: Compute the hyperbolic arc-tangent.
-{\texttt{expm1}}: Compute exp(x) - 1 accurately also for $|x|$\textless{}\textless{} 1.
-{\texttt{cos}}: Compute the trigonometric cosine.
-{\texttt{cosh}}: Compute the hyperbolic cosine.
-{\texttt{cospi}}: Compute the trigonometric two-argument arc-cosine.
-{\texttt{sin}}: Compute the trigonometric sine.
-{\texttt{sinh}}: Compute the hyperbolic sine.
-{\texttt{sinpi}}: Compute the trigonometric two-argument arc-sine.
-{\texttt{tan}}: Compute the trigonometric tangent.
-{\texttt{tanh}}: Compute the hyperbolic tangent.
-{\texttt{tanpi}}: Compute the trigonometric two-argument arc-tangent.
-{\texttt{gamma}}: Display the gamma function $\gamma{x}$
-{\texttt{lgamma}}: Display the natural logarithm of the absolute value of the gamma function.
-{\texttt{digamma}}: Display the first derivative of the logarithm of the gamma function.
-{\texttt{trigamma}}: Display the second derivative of the logarithm of the gamma function.
+{\texttt{cummax}}: Display a vector of the cumulative maxima of the elements of the argument.
+{\texttt{cummin}}: Display a vector of the cumulative minima of the elements of the argument.
+{\texttt{cumprod}}: Display a vector of the cumulative products of the elements of the argument.
+{\texttt{cumsum}}: Display a vector of the cumulative sums of the elements of the argument.
+{\texttt{log10}}: Compute common (i.e., base 10) logarithms.
+{\texttt{log2}}: Compute binary (i.e., base 2) logarithms.
+{\texttt{log1p}}: Compute log(1+x) accurately also for $|x|${}\textless{}\textless{} 1.
+{\texttt{acos}}: Compute the trigonometric arc-cosine.
+{\texttt{acosh}}: Compute the hyperbolic arc-cosine.
+{\texttt{asin}}: Compute the trigonometric arc-sine.
+{\texttt{asinh}}: Compute the hyperbolic arc-sine.
+{\texttt{atan}}: Compute the trigonometric arc-tangent.
+{\texttt{atanh}}: Compute the hyperbolic arc-tangent.
+{\texttt{expm1}}: Compute exp(x) - 1 accurately also for $|x|$\textless{}\textless{} 1.
+{\texttt{cos}}: Compute the trigonometric cosine.
+{\texttt{cosh}}: Compute the hyperbolic cosine.
+{\texttt{cospi}}: Compute the cosine of pi times the argument, $\cos(\pi x)$.
+{\texttt{sin}}: Compute the trigonometric sine.
+{\texttt{sinh}}: Compute the hyperbolic sine.
+{\texttt{sinpi}}: Compute the sine of pi times the argument, $\sin(\pi x)$.
+{\texttt{tan}}: Compute the trigonometric tangent.
+{\texttt{tanh}}: Compute the hyperbolic tangent.
+{\texttt{tanpi}}: Compute the tangent of pi times the argument, $\tan(\pi x)$.
+{\texttt{gamma}}: Display the gamma function $\Gamma(x)$.
+{\texttt{lgamma}}: Display the natural logarithm of the absolute value of the gamma function.
+{\texttt{digamma}}: Display the first derivative of the logarithm of the gamma function.
+{\texttt{trigamma}}: Display the second derivative of the logarithm of the gamma function.
\medskip
\emph{Math2 (H2O)}\par
-{\texttt{round}}: Round the values to the specified number of decimal places. The default is 0.
-{\texttt{signif}}: Round the values to the specified number of significant digits.
+{\texttt{round}}: Round the values to the specified number of decimal places. The default is 0.
+{\texttt{signif}}: Round the values to the specified number of significant digits.
\emph{Summary (H2O)}\par
-{\texttt{max}}: Display the maximum of all the input arguments.
-{\texttt{min}}: Display the minimum of all the input arguments.
-{\texttt{range}}: Display a vector containing the minimum and maximum of all the given arguments.
-{\texttt{sum}}: Calculate the sum of all the values present in its arguments.
+{\texttt{max}}: Display the maximum of all the input arguments.
+{\texttt{min}}: Display the minimum of all the input arguments.
+{\texttt{range}}: Display a vector containing the minimum and maximum of all the given arguments.
+{\texttt{sum}}: Calculate the sum of all the values present in its arguments.
\medskip
\emph{Summary (generic)}\par
-{\texttt{prod}}: Display the product of all values present in its arguments.
-{\texttt{any}}: Given a set of logical vectors, determine if at least one of the values is true.
+{\texttt{prod}}: Display the product of all values present in its arguments.
+{\texttt{any}}: Given a set of logical vectors, determine if at least one of the values is true.
{\texttt{all}}: Given a set of logical vectors, determine if all of the values are true.
}
\subsection{Other Aggregations}
{\bgroup\obeylines %needed for formatting
\medskip
\emph{Non-Group Generic Summaries}\par
-{\texttt{mean}}: Generic function for the (trimmed) arithmetic mean.
-{\texttt{sd}}: Calculate the standard deviation of a column of continuous real valued data.
-{\texttt{var}}: Compute the variance of x.
-{\texttt{summary}}: Produce result summaries of the results of various model fitting functions.
-{\texttt{quantile}}: Obtain and display quantiles for H2O parsed data.
+{\texttt{mean}}: Generic function for the (trimmed) arithmetic mean.
+{\texttt{sd}}: Calculate the standard deviation of a column of continuous real valued data.
+{\texttt{var}}: Compute the variance of x.
+{\texttt{summary}}: Produce result summaries of the results of various model fitting functions.
+{\texttt{quantile}}: Obtain and display quantiles for H2O parsed data.
\medskip
\emph{Row / Column Aggregation}\par
-{\texttt{apply}}: Apply a function over an H2O parsed data object (an array).
+{\texttt{apply}}: Apply a function over an H2O parsed data object (an array).
\medskip
\emph{Group By Aggregation}\par
-%{\texttt{h2o.ddply}}: Split H2O dataset, apply a function, and display results.
-{\texttt{h2o.group\_by}}: Apply an aggregate function to each group of an H2O dataset.
+%{\texttt{h2o.ddply}}: Split H2O dataset, apply a function, and display results.
+{\texttt{h2o.group\_by}}: Apply an aggregate function to each group of an H2O dataset.
\medskip
\emph{Tabulation}\par
@@ -1180,66 +1179,66 @@ \subsection{Data Munging}\par
{\bgroup\obeylines %needed for formatting
\medskip
\emph{General Column Manipulations}\par
-{\texttt{is.na}}: Display missing elements.
+{\texttt{is.na}}: Display missing elements.
\medskip
-\emph{Element Index Selection}\par
-{\texttt{h2o.which}}: Display the row numbers for which the condition is true.
+\emph{Element Index Selection}\par
+{\texttt{h2o.which}}: Display the row numbers for which the condition is true.
\medskip
-\emph{Conditional Element Value Selection}\par
-{\texttt{h2o.ifelse}}: Apply conditional statements to numeric vectors in H2O parsed data objects.
+\emph{Conditional Element Value Selection}\par
+{\texttt{h2o.ifelse}}: Apply conditional statements to numeric vectors in H2O parsed data objects.
\medskip
\emph{Numeric Column Manipulations}\par
-{\texttt{h2o.cut}}: Convert H2O Numeric Data to Factor.
+{\texttt{h2o.cut}}: Convert H2O Numeric Data to Factor.
\medskip
\emph{Character Column Manipulations}\par
-{\texttt{h2o.strsplit}}: Splits the given factor column on the input split.
-{\texttt{h2o.tolower}}: Change the elements of a character vector to lower case.
-{\texttt{h2o.toupper}}: Change the elements of a character vector to lower case.
-{\texttt{h2o.trim}}: Remove leading and trailing white space.
-{\texttt{h2o.gsub}}: Match a pattern \& replace all instances of the matched pattern with the replacement string globally.
-{\texttt{h2o.sub}}: Match a pattern \& replace the first instance of the matched pattern with the replacement string.
+{\texttt{h2o.strsplit}}: Splits the given factor column on the input split.
+{\texttt{h2o.tolower}}: Change the elements of a character vector to lower case.
+{\texttt{h2o.toupper}}: Change the elements of a character vector to upper case.
+{\texttt{h2o.trim}}: Remove leading and trailing white space.
+{\texttt{h2o.gsub}}: Match a pattern \& replace all instances of the matched pattern with the replacement string globally.
+{\texttt{h2o.sub}}: Match a pattern \& replace the first instance of the matched pattern with the replacement string.
\medskip
\emph{Factor Level Manipulations}\par
-{\texttt{h2o.levels}}: Display a list of the unique values found in a column of categorical data.
+{\texttt{h2o.levels}}: Display a list of the unique values found in a column of categorical data.
\medskip
\emph{Date Manipulations}\par
-{\texttt{h2o.month}}: Convert the entries of a H2OParsedData object from milliseconds to months (on a 0 to 11 scale).
-{\texttt{h2o.year}}: Convert the entries of a H2OParsedData object from milliseconds to years, indexed starting from 1900.
+{\texttt{h2o.month}}: Convert the entries of a H2OParsedData object from milliseconds to months (on a 0 to 11 scale).
+{\texttt{h2o.year}}: Convert the entries of a H2OParsedData object from milliseconds to years, indexed starting from 1900.
\emph{Matrix Operations}\par
-{\texttt{\%$*$\%}}}: Multiply two matrices, if they are conformable.
-{\texttt{t}}: Given a matrix or data.frame x, t returns the transpose of x.
+{\texttt{\%$*$\%}}: Multiply two matrices, if they are conformable.
+{\texttt{t}}: Given a matrix or data.frame x, t returns the transpose of x.
}
\subsection{Data Modeling}
{\bgroup\obeylines %needed for formatting
\medskip
\emph{Model Training: Supervised Learning}\par
-{\texttt{h2o.deeplearning}}: Perform Deep Learning neural networks on an\\ H2OParsedData object.
-{\texttt{h2o.gbm}}: Build gradient boosted classification trees and gradient boosted regression trees on a parsed dataset.
-{\texttt{h2o.glm}}: Fit a generalized linear model, specified by a response variable, a set of predictors, and a description of the error distribution.
-{\texttt{h2o.naiveBayes}}: Build gradient boosted classification trees and gradient boosted regression trees on a parsed dataset.
-{\texttt{h2o.prcomp}}: Perform principal components analysis on the given dataset.
-{\texttt{h2o.randomForest}}: Perform random forest classification on a dataset.
+{\texttt{h2o.deeplearning}}: Perform Deep Learning neural networks on an\\ H2OParsedData object.
+{\texttt{h2o.gbm}}: Build gradient boosted classification trees and gradient boosted regression trees on a parsed dataset.
+{\texttt{h2o.glm}}: Fit a generalized linear model, specified by a response variable, a set of predictors, and a description of the error distribution.
+{\texttt{h2o.naiveBayes}}: Build a naive Bayes classifier on a parsed dataset.
+{\texttt{h2o.prcomp}}: Perform principal components analysis on the given dataset.
+{\texttt{h2o.randomForest}}: Perform random forest classification on a dataset.
\medskip
\emph{Model Training: Unsupervised Learning}\par
-{\texttt{h2o.anomaly}}: Detect anomalies in a H2O dataset using a H2O deep learning model with auto-encoding.
-{\texttt{h2o.deepfeatures}}: Extract the non-linear features from a H2O dataset using a H2O deep learning model.
-{\texttt{h2o.kmeans}}: Perform k-means clustering on a dataset.
+{\texttt{h2o.anomaly}}: Detect anomalies in a H2O dataset using a H2O deep learning model with auto-encoding.
+{\texttt{h2o.deepfeatures}}: Extract the non-linear features from a H2O dataset using a H2O deep learning model.
+{\texttt{h2o.kmeans}}: Perform k-means clustering on a dataset.
\medskip
\emph{Grid Search}\par
-{\texttt{h2o.grid}}: Efficient method to build multiple models with different hyperparameters.
+{\texttt{h2o.grid}}: Efficient method to build multiple models with different hyperparameters.
\medskip
\emph{Model Scoring}\par
-{\texttt{h2o.predict}}: Obtain predictions from various fitted H2O model objects.
+{\texttt{h2o.predict}}: Obtain predictions from various fitted H2O model objects.
\medskip
\emph{Model Metrics}\par
@@ -1247,40 +1246,40 @@ \subsection{Data Modeling}
\medskip
\emph{Classification Model Helpers}\par
-{\texttt{h2o.accuracy}}: Get the between cluster sum of squares.
-{\texttt{h2o.auc}}: Retrieve the AUC (area under ROC curve).
-{\texttt{h2o.confusionMatrix}}: Display prediction errors for classification data from a column of predicted responses and a column of actual (reference) responses in H2O.
+{\texttt{h2o.accuracy}}: Retrieve the accuracy of a classification model.
+{\texttt{h2o.auc}}: Retrieve the AUC (area under ROC curve).
+{\texttt{h2o.confusionMatrix}}: Display prediction errors for classification data from a column of predicted responses and a column of actual (reference) responses in H2O.
{\texttt{h2o.hit\_ratio\_table}}: Retrieve the Hit Ratios. If {\texttt{train}}, {\texttt{valid}}, and {\texttt{xval}} parameters are FALSE (default), then the training Hit Ratios value is returned. If more than one parameter is set to TRUE, then a named list
-of Hit Ratio tables are returned, where the names are {\texttt{train}}, {\texttt{valid}}, or {\texttt{xval}}.
-{\texttt{h2o.performance}}: Evaluate the predictive performance of a model via various measures.
+of Hit Ratio tables are returned, where the names are {\texttt{train}}, {\texttt{valid}}, or {\texttt{xval}}.
+{\texttt{h2o.performance}}: Evaluate the predictive performance of a model via various measures.
\medskip
\emph{Regression Model Helper}\par
-{\texttt{h2o.mse}}: Display the mean squared error calculated from a column of predicted responses and a column of actual (reference) responses in H2O.
+{\texttt{h2o.mse}}: Display the mean squared error calculated from a column of predicted responses and a column of actual (reference) responses in H2O.
\medskip
\emph{Clustering Model Helper}\par
-{\texttt{h2o.betweenss}}: Get the between cluster sum of squares.
-{\texttt{h2o.centers}}: Retrieve the Model Centers.
+{\texttt{h2o.betweenss}}: Get the between cluster sum of squares.
+{\texttt{h2o.centers}}: Retrieve the Model Centers.
}
\subsection{H2O Cluster Operations}
{\bgroup\obeylines %needed for formatting
\medskip
\emph{H2O Key Value Store Access}\par
-{\texttt{h2o.assign}}: Assign H2O hex.keys to objects in their R environment.
-{\texttt{h2o.getFrame}}: Get a reference to an existing H2O dataset.
-{\texttt{h2o.getModel}}: Get a reference to an existing H2O model.
-{\texttt{h2o.ls: }}Display a list of object keys in the running instance of H2O.
-{\texttt{h2o.rm}}: Remove H2O objects from the server where the instance of H2O is running, but does not remove it from the R environment.
+{\texttt{h2o.assign}}: Assign H2O hex.keys to objects in their R environment.
+{\texttt{h2o.getFrame}}: Get a reference to an existing H2O dataset.
+{\texttt{h2o.getModel}}: Get a reference to an existing H2O model.
+{\texttt{h2o.ls}}: Display a list of object keys in the running instance of H2O.
+{\texttt{h2o.rm}}: Remove H2O objects from the server where the instance of H2O is running, but does not remove it from the R environment.
\medskip
\emph{H2O Object Serialization}\par
-{\texttt{h2o.loadModel}}: Load an H2OModel object from disk.
-{\texttt{h2o.saveModel}}: Save an H2OModel object to disk to be loaded back into H2O using {\texttt{h2o.loadModel}}.
+{\texttt{h2o.loadModel}}: Load an H2OModel object from disk.
+{\texttt{h2o.saveModel}}: Save an H2OModel object to disk to be loaded back into H2O using {\texttt{h2o.loadModel}}.
-\emph{H2O Cluster Connection}\par
-{\texttt{h2o.init (nthreads = -1)}}: Connect to a running H2O instance using all CPUs on the host and check the local H2O R package is the correct version.
-{\texttt{h2o.shutdown}}: Shut down the specified H2O instance. All data on the server will be lost!
+\emph{H2O Cluster Connection}\par
+{\texttt{h2o.init (nthreads = -1)}}: Connect to a running H2O instance using all CPUs on the host and check the local H2O R package is the correct version.
+{\texttt{h2o.shutdown}}: Shut down the specified H2O instance. All data on the server will be lost!
\medskip
\emph{H2O Load Balancing}\par
@@ -1288,27 +1287,27 @@ \subsection{H2O Cluster Operations}
\medskip
\emph{H2O Cluster Information}\par
-{\texttt{h2o.clusterInfo}}: Display the name, version, uptime, total nodes, total memory, total cores and health of a cluster running H2O.
-{\texttt{h2o.clusterStatus}}: Retrieve information on the status of the cluster running H2O.
+{\texttt{h2o.clusterInfo}}: Display the name, version, uptime, total nodes, total memory, total cores and health of a cluster running H2O.
+{\texttt{h2o.clusterStatus}}: Retrieve information on the status of the cluster running H2O.
\medskip
\emph{H2O Logging}\par
-{\texttt{h2o.clearLog}}: Clear all H2O R command and error response logs from the local disk.
-{\texttt{h2o.downloadAllLogs}}: Download all H2O log files to the local disk.
-{\texttt{h2o.logAndEcho}}: Write a message to the H2O Java log file and echo it back.
-{\texttt{h2o.openLog}}: Open existing logs of H2O R POST commands and error responses on the local disk.
-{\texttt{h2o.getLogPath}}: Get the file path for the H2O R command and error response logs.
-{\texttt{h2o.startLogging}}: Begin logging H2O R POST commands and error responses.
-{\texttt{h2o.stopLogging}}: Stop logging H2O R POST commands and error responses.
+{\texttt{h2o.clearLog}}: Clear all H2O R command and error response logs from the local disk.
+{\texttt{h2o.downloadAllLogs}}: Download all H2O log files to the local disk.
+{\texttt{h2o.logAndEcho}}: Write a message to the H2O Java log file and echo it back.
+{\texttt{h2o.openLog}}: Open existing logs of H2O R POST commands and error responses on the local disk.
+{\texttt{h2o.getLogPath}}: Get the file path for the H2O R command and error response logs.
+{\texttt{h2o.startLogging}}: Begin logging H2O R POST commands and error responses.
+{\texttt{h2o.stopLogging}}: Stop logging H2O R POST commands and error responses.
\medskip
\emph{H2O String Manipulation}\par
-{\texttt{h2o.gsub}}: String global substitution (all occurrences).
-{\texttt{h2o.strsplit}}: String Split.
-{\texttt{h2o.sub}}: String substitution (first occurrence).
-{\texttt{h2o.tolower}}: Convert characters to lower case.
-{\texttt{h2o.toupper}}: Convert characters to upper case.
-{\texttt{h2o.trim}}: Trim spaces.
+{\texttt{h2o.gsub}}: String global substitution (all occurrences).
+{\texttt{h2o.strsplit}}: String Split.
+{\texttt{h2o.sub}}: String substitution (first occurrence).
+{\texttt{h2o.tolower}}: Convert characters to lower case.
+{\texttt{h2o.toupper}}: Convert characters to upper case.
+{\texttt{h2o.trim}}: Trim spaces.
}
%\egroup %needed for formatting
@@ -1326,7 +1325,7 @@ \section{Authors}
\textbf{Eric Eckstrand}
-Eric is a Quality and Performance Hacker at H2O.ai. Eric has formal education in computer science and systems engineering. Prior to joining H2O, Eric was a submariner in the US Navy. His roles included Reactor Controls Assistant and Communications Officer (USS Pittsburgh SSN-720) and Submarine Operations Officer (Destroyer Squadron One).
+Eric is a Quality and Performance Hacker at H2O.ai. Eric has formal education in computer science and systems engineering. Prior to joining H2O, Eric was a submariner in the US Navy. His roles included Reactor Controls Assistant and Communications Officer (USS Pittsburgh SSN-720) and Submarine Operations Officer (Destroyer Squadron One).
\textbf{Anqi Fu}
@@ -1348,7 +1347,7 @@ \section{Authors}
\newpage
\section{References}
-\bibliographystyle{plainnat}
+\bibliographystyle{plainnat}
\nobibliography{bibliography}
diff --git a/h2o-docs/src/product/hadoop.rst b/h2o-docs/src/product/hadoop.rst
index 9ef4b2555fa..47fecd7de14 100644
--- a/h2o-docs/src/product/hadoop.rst
+++ b/h2o-docs/src/product/hadoop.rst
@@ -18,7 +18,6 @@ Currently supported versions:
- HDP 2.4
- HDP 2.5
- HDP 2.6
-- MapR 3.1
- MapR 4.0
- MapR 5.0
- MapR 5.1
@@ -40,7 +39,7 @@ Prerequisite: Open Communication Paths
--------------------------------------
H2O communicates using two communication paths. Verify these are open
-and available for use by H2O.
+and available for use by H2O.
**Path 1: mapper to driver**
@@ -125,7 +124,7 @@ and the parameters involved in launching H2O from the command line.
H2O node 172.16.2.184:54321 requested flatfile
Sending flatfiles to nodes...
[Sending flatfile to node 172.16.2.184:54321]
- H2O node 172.16.2.184:54321 reports H2O cluster size 1
+ H2O node 172.16.2.184:54321 reports H2O cluster size 1
H2O cluster (1 nodes) is up
Blocking until the H2O cluster shuts down...
@@ -216,7 +215,7 @@ Then import the data with the S3 URL path:
::
- importFiles [ "s3n:/path/to/bucket/file/file.tab.gz" ]
+ importFiles [ "s3n:/path/to/bucket/file/file.tab.gz" ]
- To import the data from the R API:
diff --git a/h2o-docs/src/product/howto/FAQ.md b/h2o-docs/src/product/howto/FAQ.md
index d896e4969cd..f5d423b4d12 100644
--- a/h2o-docs/src/product/howto/FAQ.md
+++ b/h2o-docs/src/product/howto/FAQ.md
@@ -3,32 +3,32 @@
##General Troubleshooting Tips
-- Confirm your internet connection is active.
+- Confirm your internet connection is active.
- Test connectivity using curl: First, log in to the first node and enter `curl http://:54321` (where `` is the IP address of the second node. Then, log in to the second node and enter `curl http://:54321` (where `` is the IP address of the first node). Look for output from H2O.
- Try allocating more memory to H2O by modifying the `-Xmx` value when launching H2O from the command line (for example, `java -Xmx10g -jar h2o.jar` allocates 10g of memory for H2O). If you create a cluster with four 20g nodes (by specifying `-Xmx20g` four times), H2O will have a total of 80 gigs of memory available. For best performance, we recommend sizing your cluster to be about four times the size of your data. To avoid swapping, the `-Xmx` allocation must not exceed the physical memory on any node. Allocating the same amount of memory for all nodes is strongly recommended, as H2O works best with symmetric nodes.
-- Confirm that no other sessions of H2O are running. To stop all running H2O sessions, enter `ps -efww | grep h2o` in your shell (OSX or Linux).
+- Confirm that no other sessions of H2O are running. To stop all running H2O sessions, enter `ps -efww | grep h2o` in your shell (OSX or Linux).
- Confirm ports 54321 and 54322 are available for both TCP and UDP. Launch Telnet (for Windows users) or Terminal (for OS X users), then type `telnet localhost 54321`, `telnet localhost 54322`
-- Confirm your firewall is not preventing the nodes from locating each other. If you can't launch H2O, we recommend temporarily disabling any firewalls until you can confirm they are not preventing H2O from launching.
+- Confirm your firewall is not preventing the nodes from locating each other. If you can't launch H2O, we recommend temporarily disabling any firewalls until you can confirm they are not preventing H2O from launching.
- Confirm the nodes are not using different versions of H2O. If the H2O initialization is not successful, look at the output in the shell - if you see `Attempting to join /localhost:54321 with an H2O version mismatch (md5 differs)`, update H2O on all the nodes to the same version.
-- Confirm that there is space in the `/tmp` directory.
- - Windows: In Command Prompt, enter `TEMP` and `%TEMP%` and delete files as needed, or use Disk Cleanup.
- - OS X: In Terminal, enter `open $TMPDIR` and delete the folder with your username.
+- Confirm that there is space in the `/tmp` directory.
+ - Windows: In Command Prompt, enter `TEMP` and `%TEMP%` and delete files as needed, or use Disk Cleanup.
+ - OS X: In Terminal, enter `open $TMPDIR` and delete the folder with your username.
- Confirm that the username is the same on all nodes; if not, define the cloud in the terminal when launching using `-name`:`java -jar h2o.jar -name myCloud`.
-- Confirm that there are no spaces in the file path name used to launch H2O.
-- Confirm that the nodes are not on different networks by confirming that the IP addresses of the nodes are the same in the output:
+- Confirm that there are no spaces in the file path name used to launch H2O.
+- Confirm that the nodes are not on different networks by confirming that the IP addresses of the nodes are the same in the output:
```
INFO: Listening for HTTP and REST traffic on IP_Address/
-06-18 10:54:21.586 192.168.1.70:54323 25638 main
+06-18 10:54:21.586 192.168.1.70:54323 25638 main
INFO: H2O cloud name: 'H2O_User' on IP_Address, discovery address /Discovery_Address
INFO: Cloud of size 1 formed [IP_Address]
```
- Check if the nodes have different interfaces; if so, use the -network option to define the network (for example, `-network 127.0.0.1`). To use a network range, use a comma to separate the IP addresses (for example, `-network 123.45.67.0/22,123.45.68.0/24`).
- Force the bind address using `-ip`:`java -jar h2o.jar -ip -port `.
- (Hadoop only) Try launching H2O with a longer timeout: `hadoop jar h2odriver.jar -timeout 1800`
-- (Hadoop only) Try to launch H2O using more memory: `hadoop jar h2odriver.jar -mapperXmx 10g`. The cluster’s memory capacity is the sum of all H2O nodes in the cluster.
+- (Hadoop only) Try to launch H2O using more memory: `hadoop jar h2odriver.jar -mapperXmx 10g`. The cluster’s memory capacity is the sum of all H2O nodes in the cluster.
- (Linux only) Check if you have SELINUX or IPTABLES enabled; if so, disable them.
- (EC2 only) Check the configuration for the EC2 security group.
@@ -53,7 +53,7 @@ Exception in thread "main" java.lang.UnsupportedClassVersionError: water/H2OApp
at java.lang.ClassLoader.loadClass(Unknown Source)
Could not find the main class: water.H2OApp. Program will exit.
```
-This error output indicates that your Java version is not supported. Upgrade to [Java 7 (JVM)](http://www.oracle.com/technetwork/java/javase/downloads/jdk7-downloads-1880260.html) or [later](http://www.oracle.com/technetwork/java/javase/downloads/jre8-downloads-2133155.html) and H2O should launch successfully.
+This error output indicates that your Java version is not supported. Upgrade to [Java 7 (JVM)](http://www.oracle.com/technetwork/java/javase/downloads/jdk7-downloads-1880260.html) or [later](http://www.oracle.com/technetwork/java/javase/downloads/jre8-downloads-2133155.html) and H2O should launch successfully.
---
@@ -68,22 +68,22 @@ This error output indicates that your Java version is not supported. Upgrade to
**What does it mean if the r2 value in my model is negative?**
-The coefficient of determination (also known as r^2) can be negative if:
+The coefficient of determination (also known as r^2) can be negative if:
- linear regression is used without an intercept (constant)
- non-linear functions are fitted to the data
- predictions compared to the corresponding outcomes are not based on the model-fitting procedure using those data
- it is early in the build process (may self-correct as more trees are added)
-If your r2 value is negative after your model is complete, your model is likely incorrect. Make sure your data is suitable for the type of model, then try adding an intercept.
+If your r2 value is negative after your model is complete, your model is likely incorrect. Make sure your data is suitable for the type of model, then try adding an intercept.
---
**What's the process for implementing new algorithms in H2O?**
-This [blog post](http://h2o.ai/blog/2014/16/Hacking/Algos/) by Cliff walks you through building a new algorithm, using K-Means, Quantiles, and Grep as examples.
+This [blog post](http://h2o.ai/blog/2014/16/Hacking/Algos/) by Cliff walks you through building a new algorithm, using K-Means, Quantiles, and Grep as examples.
-To learn more about performance characteristics when implementing new algorithms, refer to Cliff's [KV Store Guide](http://0xdata.com/blog/2014/05/kv-store-memory-analytics-part-2-2/).
+To learn more about performance characteristics when implementing new algorithms, refer to Cliff's [KV Store Guide](http://0xdata.com/blog/2014/05/kv-store-memory-analytics-part-2-2/).
---
@@ -94,16 +94,16 @@ P-values are currently supported for non-regularized GLM. The following requirem
- The family cannot be multinomial
- The lambda value must be equal to zero
- The IRLSM solver must be used
-- Lambda search cannot be used
+- Lambda search cannot be used
To generate p-values, do one of the following:
- check the *compute_p_values* checkbox in the GLM model builder in Flow
- use `compute_p_values=TRUE` in R or Python while creating the model
-The p-values are listed in the coefficients table (as shown in the following example screenshot):
+The p-values are listed in the coefficients table (as shown in the following example screenshot):
- 
+ 
---
@@ -111,60 +111,60 @@ The p-values are listed in the coefficients table (as shown in the following exa
**How do I specify regression or classification for Distributed Random Forest in the web UI?**
-If the response column is numeric, H2O generates a regression model. If the response column is enum, the model uses classification. To specify the column type, select it from the drop-down column name list in the **Edit Column Names and Types** section during parsing.
+If the response column is numeric, H2O generates a regression model. If the response column is enum, the model uses classification. To specify the column type, select it from the drop-down column name list in the **Edit Column Names and Types** section during parsing.
---
**What's the largest number of classes that H2O supports for multinomial prediction?**
-For tree-based algorithms, the maximum number of classes (or levels) for a response column is 1000.
+For tree-based algorithms, the maximum number of classes (or levels) for a response column is 1000.
---
**How do I obtain a tree diagram of my DRF model?**
-Output the SVG code for the edges and nodes. A simple tree visitor is available [here](https://github.com/h2oai/h2o-3/blob/master/h2o-algos/src/main/java/hex/tree/TreeVisitor.java) and the Java code generator is available [here](https://github.com/h2oai/h2o-3/blob/master/h2o-algos/src/main/java/hex/tree/TreeJCodeGen.java).
+Output the SVG code for the edges and nodes. A simple tree visitor is available [here](https://github.com/h2oai/h2o-3/blob/master/h2o-algos/src/main/java/hex/tree/TreeVisitor.java) and the Java code generator is available [here](https://github.com/h2oai/h2o-3/blob/master/h2o-algos/src/main/java/hex/tree/TreeJCodeGen.java).
---
**Is Word2Vec available? I can see the Java and R sources, but calling the API generates an error.**
-Word2Vec, along with other natural language processing (NLP) algos, are currently in development in the current version of H2O.
+Word2Vec, along with other natural language processing (NLP) algos, is currently in development in the current version of H2O.
---
**What are the "best practices" for preparing data for a K-Means model?**
-There aren't specific "best practices," as it depends on your data and the column types. However, removing outliers and transforming any categorical columns to have the same weight as the numeric columns will help, especially if you're standardizing your data.
+There aren't specific "best practices," as it depends on your data and the column types. However, removing outliers and transforming any categorical columns to have the same weight as the numeric columns will help, especially if you're standardizing your data.
---
**What is your implementation of Deep Learning based on?**
- Our Deep Learning algorithm is based on the feedforward neural net. For more information, refer to our Data Science documentation or [Wikipedia](https://en.wikipedia.org/wiki/Feedforward_neural_network).
+ Our Deep Learning algorithm is based on the feedforward neural net. For more information, refer to our Data Science documentation or [Wikipedia](https://en.wikipedia.org/wiki/Feedforward_neural_network).
---
**How is deviance computed for a Deep Learning regression model?**
-For a Deep Learning regression model, deviance is computed as follows:
+For a Deep Learning regression model, deviance is computed as follows:
Loss = MeanSquare -> MSE==Deviance
-For Absolute/Laplace or Huber -> MSE != Deviance.
+For Absolute/Laplace or Huber -> MSE != Deviance.
---
**For my 0-tree GBM multinomial model, I got a different score depending on whether or not validation was enabled, even though my dataset was the same - why is that?**
-Different results may be generated because of the way H2O computes the initial MSE.
+Different results may be generated because of the way H2O computes the initial MSE.
---
**How does your Deep Learning Autoencoder work? Is it deep or shallow?**
-H2O’s DL autoencoder is based on the standard deep (multi-layer) neural net architecture, where the entire network is learned together, instead of being stacked layer-by-layer. The only difference is that no response is required in the input and that the output layer has as many neurons as the input layer. If you don’t achieve convergence, then try using the *Tanh* activation and fewer layers. We have some example test scripts [here](https://github.com/h2oai/h2o-3/blob/master/h2o-r/tests/testdir_algos/deeplearning/), and even some that show [how stacked auto-encoders can be implemented in R](https://github.com/h2oai/h2o-3/blob/master/h2o-r/tests/testdir_algos/deeplearning/runit_deeplearning_stacked_autoencoder_large.R).
+H2O’s DL autoencoder is based on the standard deep (multi-layer) neural net architecture, where the entire network is learned together, instead of being stacked layer-by-layer. The only difference is that no response is required in the input and that the output layer has as many neurons as the input layer. If you don’t achieve convergence, then try using the *Tanh* activation and fewer layers. We have some example test scripts [here](https://github.com/h2oai/h2o-3/blob/master/h2o-r/tests/testdir_algos/deeplearning/), and even some that show [how stacked auto-encoders can be implemented in R](https://github.com/h2oai/h2o-3/blob/master/h2o-r/tests/testdir_algos/deeplearning/runit_deeplearning_stacked_autoencoder_large.R).
@@ -173,52 +173,52 @@ H2O’s DL autoencoder is based on the standard deep (multi-layer) neural net ar
**Are there any H2O examples using text for classification?**
-Currently, the following examples are available for Sparkling Water:
+Currently, the following examples are available for Sparkling Water:
-a) Use TF-IDF weighting scheme for classifying text messages
-https://github.com/h2oai/sparkling-water/blob/master/examples/scripts/mlconf_2015_hamSpam.script.scala
+a) Use TF-IDF weighting scheme for classifying text messages
+https://github.com/h2oai/sparkling-water/blob/master/examples/scripts/mlconf_2015_hamSpam.script.scala
-b) Use Word2Vec Skip-gram model + GBM for classifying job titles
-https://github.com/h2oai/sparkling-water/blob/master/examples/scripts/craigslistJobTitles.scala
+b) Use Word2Vec Skip-gram model + GBM for classifying job titles
+https://github.com/h2oai/sparkling-water/blob/master/examples/scripts/craigslistJobTitles.scala
---
**Most machine learning tools cannot predict with a new categorical level that was not included in the training set. How does H2O make predictions in this scenario?**
-Here is an example of how the prediction process works in H2O:
+Here is an example of how the prediction process works in H2O:
0. Train a model using data that has a categorical predictor column with levels B,C, and D (no other levels); this level will be the "training set domain": {B,C,D}
0. During scoring, the test set has only rows with levels A,C, and E for that column; this is the "test set domain": {A,C,E}
-0. For scoring, a combined "scoring domain" is created, which is the training domain appended with the extra test set domain entries: {B,C,D,A,E}
-0. Each model can handle these extra levels {A,E} separately during scoring.
+0. For scoring, a combined "scoring domain" is created, which is the training domain appended with the extra test set domain entries: {B,C,D,A,E}
+0. Each model can handle these extra levels {A,E} separately during scoring.
-The behavior for unseen categorical levels depends on the algorithm and how it handles missing levels (NA values):
+The behavior for unseen categorical levels depends on the algorithm and how it handles missing levels (NA values):
- For DRF and GBM, missing values are interpreted as containing information (i.e., missing for a reason), rather than missing at random. During tree building, split decisions for every node are found by minimizing the loss function and treating missing values as a separate category that can go either left or right.
-- Deep Learning creates an extra input neuron for missing and unseen categorical levels, which can remain untrained if there were no missing or unseen categorical levels in the training data, resulting in a random contribution to the next layer during testing.
-- GLM skips unseen levels in the beta*x dot product.
+- Deep Learning creates an extra input neuron for missing and unseen categorical levels, which can remain untrained if there were no missing or unseen categorical levels in the training data, resulting in a random contribution to the next layer during testing.
+- GLM skips unseen levels in the beta*x dot product.
---
**How are quantiles computed?**
-The quantile results in Flow are computed lazily on-demand and cached. It is a fast approximation (max - min / 1024) that is very accurate for most use cases.
-If the distribution is skewed, the quantile results may not be as accurate as the results obtained using `h2o.quantile` in R or `H2OFrame.quantile` in Python.
+The quantile results in Flow are computed lazily on-demand and cached. It is a fast approximation (max - min / 1024) that is very accurate for most use cases.
+If the distribution is skewed, the quantile results may not be as accurate as the results obtained using `h2o.quantile` in R or `H2OFrame.quantile` in Python.
---
**How do I create a classification model? The model always defaults to regression.**
-To create a classification model, the response column type must be `enum` - if the response is `numeric`, a regression model is created.
+To create a classification model, the response column type must be `enum` - if the response is `numeric`, a regression model is created.
-To convert the response column:
+To convert the response column:
- Before parsing, click the drop-down menu to the right of the column name or number and select `Enum`

- or
+ or
- Click on the .hex link for the data frame (or use the `getFrameSummary "<frame_name>.hex"` command, where `<frame_name>` is the name of the frame), then click the **Convert to enum** link to the right of the column name or number
@@ -236,14 +236,14 @@ To convert the response column:
```
Error: Missing name at classes.R:19
In addition: Warning messages:
-1: @S3method is deprecated. Please use @export instead
-2: @S3method is deprecated. Please use @export instead
+1: @S3method is deprecated. Please use @export instead
+2: @S3method is deprecated. Please use @export instead
Execution halted
```
-To build H2O, [Roxygen2](https://cran.r-project.org/web/packages/roxygen2/vignettes/roxygen2.html) version 4.1.1 is required.
-
-To update your Roxygen2 version, install the `versions` package in R, then use `install.versions("roxygen2", "4.1.1")`.
+To build H2O, [Roxygen2](https://cran.r-project.org/web/packages/roxygen2/vignettes/roxygen2.html) version 4.1.1 is required.
+
+To update your Roxygen2 version, install the `versions` package in R, then use `install.versions("roxygen2", "4.1.1")`.
@@ -251,13 +251,13 @@ To update your Roxygen2 version, install the `versions` package in R, then use `
**Using `./gradlew build` doesn't generate a build successfully - is there anything I can do to troubleshoot?**
-Use `./gradlew clean` before running `./gradlew build`.
+Use `./gradlew clean` before running `./gradlew build`.
---
**I tried using `./gradlew build` after using `git pull` to update my local H2O repo, but now I can't get H2O to build successfully - what should I do?**
-Try using `./gradlew build -x test` - the build may be failing tests if data is not synced correctly.
+Try using `./gradlew build -x test` - the build may be failing tests if data is not synced correctly.
---
@@ -266,15 +266,15 @@ Try using `./gradlew build -x test` - the build may be failing tests if data is
**When trying to launch H2O, I received the following error message: `ERROR: Too many retries starting cloud.` What should I do?**
-If you are trying to start a multi-node cluster where the nodes use multiple network interfaces, by default H2O will resort to using the default host (127.0.0.1).
+If you are trying to start a multi-node cluster where the nodes use multiple network interfaces, by default H2O will resort to using the default host (127.0.0.1).
-To specify an IP address, launch H2O using the following command:
+To specify an IP address, launch H2O using the following command:
`java -jar h2o.jar -ip <ip_address> -port <port>`
-If this does not resolve the issue, try the following additional troubleshooting tips:
+If this does not resolve the issue, try the following additional troubleshooting tips:
-- Confirm your internet connection is active.
+- Confirm your internet connection is active.
- Test connectivity using curl: First, log in to the first node and enter `curl http://<Node2-IP>:54321` (where `<Node2-IP>` is the IP address of the second node). Then, log in to the second node and enter `curl http://<Node1-IP>:54321` (where `<Node1-IP>` is the IP address of the first node). Look for output from H2O.
@@ -292,36 +292,36 @@ If this does not resolve the issue, try the following additional troubleshooting
**What should I do if I tried to start a cluster but the nodes started independent clouds that are not connected?**
-Because the default cloud name is the user name of the node, if the nodes are on different operating systems (for example, one node is using Windows and the other uses OS X), the different user names on each machine will prevent the nodes from recognizing that they belong to the same cloud. To resolve this issue, use `-name` to configure the same name for all nodes.
+Because the default cloud name is the user name of the node, if the nodes are on different operating systems (for example, one node is using Windows and the other uses OS X), the different user names on each machine will prevent the nodes from recognizing that they belong to the same cloud. To resolve this issue, use `-name` to configure the same name for all nodes.
---
**One of the nodes in my cluster is unavailable — what do I do?**
-H2O does not support high availability (HA). If a node in the cluster is unavailable, bring the cluster down and create a new healthy cluster.
+H2O does not support high availability (HA). If a node in the cluster is unavailable, bring the cluster down and create a new healthy cluster.
---
**How do I add new nodes to an existing cluster?**
-New nodes can only be added if H2O has not started any jobs. Once H2O starts a task, it locks the cluster to prevent new nodes from joining. If H2O has started a job, you must create a new cluster to include additional nodes.
+New nodes can only be added if H2O has not started any jobs. Once H2O starts a task, it locks the cluster to prevent new nodes from joining. If H2O has started a job, you must create a new cluster to include additional nodes.
---
**How do I check if all the nodes in the cluster are healthy and communicating?**
-In the Flow web UI, click the **Admin** menu and select **Cluster Status**.
+In the Flow web UI, click the **Admin** menu and select **Cluster Status**.
---
**How do I create a cluster behind a firewall?**
-H2O uses two ports:
+H2O uses two ports:
-- The `REST_API` port (54321): Specify when launching H2O using `-port`; uses TCP only.
-- The `INTERNAL_COMMUNICATION` port (54322): Implied based on the port specified as the `REST_API` port, +1; requires TCP and UDP.
+- The `REST_API` port (54321): Specify when launching H2O using `-port`; uses TCP only.
+- The `INTERNAL_COMMUNICATION` port (54322): Implied based on the port specified as the `REST_API` port, +1; requires TCP and UDP.
-You can start the cluster behind the firewall, but to reach it, you must make a tunnel to reach the `REST_API` port. To use the cluster, the `REST_API` port of at least one node must be reachable.
+You can start the cluster behind the firewall, but to reach it, you must make a tunnel to reach the `REST_API` port. To use the cluster, the `REST_API` port of at least one node must be reachable.
---
@@ -354,21 +354,21 @@ To avoid using 127.0.0.1 on servers with multiple local IP addresses, run the co
**How does the timeline tool work?**
-The timeline is a debugging tool that provides information on the current communication between H2O nodes. It shows a snapshot of the most recent messages passed between the nodes. Each node retains its own history of messages sent to or received from other nodes.
+The timeline is a debugging tool that provides information on the current communication between H2O nodes. It shows a snapshot of the most recent messages passed between the nodes. Each node retains its own history of messages sent to or received from other nodes.
-H2O collects these messages from all the nodes and orders them by whether they were sent or received. Each node has an implicit internal order where sent messages must precede received messages on the other node.
+H2O collects these messages from all the nodes and orders them by whether they were sent or received. Each node has an implicit internal order where sent messages must precede received messages on the other node.
-The following information displays for each message:
+The following information displays for each message:
- `HH:MM:SS:MS` and `nanosec`: The local time of the event
- `Who`: The endpoint of the message; can be either a source/receiver node or source node and multicast for broadcasted messages
- `I/O Type`: The type of communication (either UDP for small messages or TCP for large messages)
- >**Note**: UDP messages are only sent if the UDP option was enabled when launching H2O or for multicast when a flatfile is not used for configuration.
+ >**Note**: UDP messages are only sent if the UDP option was enabled when launching H2O or for multicast when a flatfile is not used for configuration.
- `Event`: The type of H2O message. The most common type is a distributed task, which displays as `exec` (the requested task) -> `ack` (results of the processed task) -> `ackck` (sender acknowledges receiving the response, task is completed and removed)
-- `rebooted`: Sent during node startup
+- `rebooted`: Sent during node startup
- `heartbeat`: Provides small message tracking information about node health, exchanged periodically between nodes
- `fetchack`: Acknowledgement of the `Fetch` type task, which retrieves the ID of a previously unseen type
-- `bytes`: Information extracted from the message, including the type of the task and the unique task number
+- `bytes`: Information extracted from the message, including the type of the task and the unique task number
@@ -379,18 +379,18 @@ The following information displays for each message:
**How should I format my SVMLight data before importing?**
-The data must be formatted as a sorted list of unique integers, the column indices must be >= 1, and the columns must be in ascending order.
+The data must be formatted as a sorted list of unique integers, the column indices must be >= 1, and the columns must be in ascending order.
---
**What date and time formats does H2O support?**
-H2O is set to auto-detect two major date/time formats. Because many date time formats are ambiguous (e.g. 01/02/03), general date time detection is not used.
+H2O is set to auto-detect two major date/time formats. Because many date time formats are ambiguous (e.g. 01/02/03), general date time detection is not used.
-The first format is for dates formatted as yyyy-MM-dd. Year is a four-digit number, the month is a two-digit number ranging from 1 to 12, and the day is a two-digit value ranging from 1 to 31. This format can also be followed by a space and then a time (specified below).
+The first format is for dates formatted as yyyy-MM-dd. Year is a four-digit number, the month is a two-digit number ranging from 1 to 12, and the day is a two-digit value ranging from 1 to 31. This format can also be followed by a space and then a time (specified below).
-The second date format is for dates formatted as dd-MMM-yy. Here the day must be one or two digits with a value ranging from 1 to 31. The month must be either a three-letter abbreviation or the full month name but is not case sensitive. The year must be either two or four digits. In agreement with [POSIX](https://en.wikipedia.org/wiki/POSIX) standards, two-digit dates >= 69 are assumed to be in the 20th century (e.g. 1969) and the rest are part of the 21st century. This date format can be followed by either a space or colon character and then a time. The '-' between the values is optional.
+The second date format is for dates formatted as dd-MMM-yy. Here the day must be one or two digits with a value ranging from 1 to 31. The month must be either a three-letter abbreviation or the full month name but is not case sensitive. The year must be either two or four digits. In agreement with [POSIX](https://en.wikipedia.org/wiki/POSIX) standards, two-digit dates >= 69 are assumed to be in the 20th century (e.g. 1969) and the rest are part of the 21st century. This date format can be followed by either a space or colon character and then a time. The '-' between the values is optional.
Times are specified as HH:mm:ss. HH is a two-digit hour and must be a value between 0-23 (for 24-hour time) or 1-12 (for a twelve-hour clock). mm is a two-digit minute value and must be a value between 0-59. ss is a two-digit second value and must be a value between 0-59. This format can be followed with either milliseconds, nanoseconds, and/or the cycle (i.e. AM/PM). If milliseconds are included, the format is HH:mm:ss:SSS. If nanoseconds are included, the format is HH:mm:ss:SSSnnnnnn. H2O only stores fractions of a second up to the millisecond, so accuracy may be lost. Nanosecond parsing is only included for convenience. Finally, a valid time can end with a space character and then either "AM" or "PM". For this format, the hours must range from 1 to 12. Within the time, the ':' character can be replaced with a '.' character.
@@ -398,13 +398,13 @@ Times are specified as HH:mm:ss. HH is a two-digit hour and must be a value betw
**How does H2O handle name collisions/conflicts in the dataset?**
-If there is a name conflict (for example, column 48 isn't named, but C48 already exists), then the column name in concatenated to itself until a unique name is created. So for the previously cited example, H2O will try renaming the column to C48C48, then C48C48C48, and so on until an unused name is generated.
+If there is a name conflict (for example, column 48 isn't named, but C48 already exists), then the column name is concatenated to itself until a unique name is created. So for the previously cited example, H2O will try renaming the column to C48C48, then C48C48C48, and so on until an unused name is generated.
---
**What types of data columns does H2O support?**
-Currently, H2O supports:
+Currently, H2O supports:
- float (any IEEE double)
- integer (up to 64bit, but compressed according to actual range)
@@ -417,7 +417,7 @@ Currently, H2O supports:
**I am trying to parse a Gzip data file containing multiple files, but it does not parse as quickly as the uncompressed files. Why is this?**
-Parsing Gzip files is not done in parallel, so it is sequential and uses only one core. Other parallel parse compression schemes are on the roadmap.
+Parsing Gzip files is not done in parallel, so it is sequential and uses only one core. Other parallel parse compression schemes are on the roadmap.
@@ -428,20 +428,20 @@ Parsing Gzip files is not done in parallel, so it is sequential and uses only on
**How do I score using an exported JSON model?**
-Since JSON is just a representation format, it cannot be directly executed, so a JSON export can't be used for scoring. However, you can score by:
+Since JSON is just a representation format, it cannot be directly executed, so a JSON export can't be used for scoring. However, you can score by:
-- including the POJO in your execution stream and handing it observations one at a time
+- including the POJO in your execution stream and handing it observations one at a time
or
-- handing your data in bulk to an H2O cluster, which will score using high throughput parallel and distributed bulk scoring.
+- handing your data in bulk to an H2O cluster, which will score using high throughput parallel and distributed bulk scoring.
---
**How do I score using an exported POJO?**
-The generated POJO can be used indepedently of a H2O cluster. First use `curl` to send the h2o-genmodel.jar file and the java code for model to the server. The following is an example; the ip address and model names will need to be changed.
+The generated POJO can be used independently of an H2O cluster. First use `curl` to send the h2o-genmodel.jar file and the Java code for the model to the server. The following is an example; the IP address and model names will need to be changed.
```
mkdir tmpdir
@@ -456,14 +456,14 @@ To score a simple .CSV file, download the [PredictCSV.java](https://raw.githubus
wget https://raw.githubusercontent.com/h2oai/h2o-3/master/h2o-r/tests/testdir_javapredict/PredictCSV.java
mkdir gbm_model_dir
javac -cp h2o-genmodel.jar -J-Xmx2g -J-XX:MaxPermSize=128m PredictCSV.java gbm_model.java -d gbm_model_dir
-```
+```
Specify the following:
-- the classpath using `-cp`
-- the model name (or class) using `--model`
-- the csv file you want to score using `--input`
-- the location for the predictions using `--output`.
-
+- the classpath using `-cp`
+- the model name (or class) using `--model`
+- the csv file you want to score using `--input`
+- the location for the predictions using `--output`.
+
You must match the table column names to the order specified in the POJO. The output file will be in a .hex format, which is a lossless text representation of floating point numbers. Both R and Java will be able to read the hex strings as numerics.
```
@@ -474,13 +474,13 @@ java -ea -cp h2o-genmodel.jar:gbm_model_dir -Xmx4g -XX:MaxPermSize=256m -XX:Rese
**How do I predict using multiple response variables?**
-Currently, H2O does not support multiple response variables. To predict different response variables, build multiple models.
+Currently, H2O does not support multiple response variables. To predict different response variables, build multiple models.
---
**How do I kill any running instances of H2O?**
-In Terminal, enter `ps -efww | grep h2o`, then kill any running PIDs. You can also find the running instance in Terminal and press **Ctrl + C** on your keyboard. To confirm no H2O sessions are still running, go to `http://localhost:54321` and verify that the H2O web UI does not display.
+In Terminal, enter `ps -efww | grep h2o`, then kill any running PIDs. You can also find the running instance in Terminal and press **Ctrl + C** on your keyboard. To confirm no H2O sessions are still running, go to `http://localhost:54321` and verify that the H2O web UI does not display.
---
@@ -502,7 +502,7 @@ In Terminal, enter `ps -efww | grep h2o`, then kill any running PIDs. You can al
The only prerequisite for running H2O is a compatible version of Java. We recommend Oracle's [Java 1.7](http://www.oracle.com/technetwork/java/javase/downloads/jdk7-downloads-1880260.html).
-
+
---
**Why did I receive the following error when I tried to launch H2O?**
@@ -572,13 +572,13 @@ Killed.
[root@sandbox h2o-dev-0.3.0.1188-hdp2.2]#
```
-The H2O launch failed because more memory was requested than was available. Make sure you are not trying to specify more memory in the launch parameters than you have available.
+The H2O launch failed because more memory was requested than was available. Make sure you are not trying to specify more memory in the launch parameters than you have available.
---
**How does the architecture of H2O work?**
-This [PDF](https://github.com/h2oai/h2o-meetups/blob/master/2014_11_18_H2O_in_Big_Data_Environments/H2OinBigDataEnvironments.pdf) includes diagrams and slides depicting how H2O works in big data environments.
+This [PDF](https://github.com/h2oai/h2o-meetups/blob/master/2014_11_18_H2O_in_Big_Data_Environments/H2OinBigDataEnvironments.pdf) includes diagrams and slides depicting how H2O works in big data environments.
---
@@ -588,12 +588,12 @@ This [PDF](https://github.com/h2oai/h2o-meetups/blob/master/2014_11_18_H2O_in_Bi
Invalid flow_dir illegal character at index 12...
```
-This error message means that there is a space (or other unsupported character) in your H2O directory. To resolve this error:
+This error message means that there is a space (or other unsupported character) in your H2O directory. To resolve this error:
-- Create a new folder without unsupported characters to use as the H2O directory (for example, `C:\h2o`).
+- Create a new folder without unsupported characters to use as the H2O directory (for example, `C:\h2o`).
+
+ or
- or
-
- Specify a different save directory using the `-flow_dir` parameter when launching H2O: `java -jar h2o.jar -flow_dir test`
---
@@ -607,23 +607,23 @@ This error message means that there is a space (or other unsupported character)
**Does H2O support GPUs?**
-Currently, we do not support this capability. If you are interested in contributing your efforts to support this feature to our open-source code database, please contact us at [h2ostream@googlegroups.com](mailto:h2ostream@googlegroups.com).
+Currently, we do not support this capability. If you are interested in contributing your efforts to support this feature to our open-source code database, please contact us at [h2ostream@googlegroups.com](mailto:h2ostream@googlegroups.com).
---
**How can I continue working on a model in H2O after restarting?**
-There are a number of ways you can save your model in H2O:
+There are a number of ways you can save your model in H2O:
-- In the web UI, click the **Flow** menu then click **Save Flow**. Your flow is saved to the *Flows* tab in the **Help** sidebar on the right.
-- In the web UI, click the **Flow** menu then click **Download this Flow...**. Depending on your browser and configuration, your flow is saved to the "Downloads" folder (by default) or to the location you specify in the pop-up **Save As** window if it appears.
-- (For DRF, GBM, and DL models only): Use model checkpointing to resume training a model. Copy the `model_id` number from a built model and paste it into the *checkpoint* field in the `buildModel` cell.
+- In the web UI, click the **Flow** menu then click **Save Flow**. Your flow is saved to the *Flows* tab in the **Help** sidebar on the right.
+- In the web UI, click the **Flow** menu then click **Download this Flow...**. Depending on your browser and configuration, your flow is saved to the "Downloads" folder (by default) or to the location you specify in the pop-up **Save As** window if it appears.
+- (For DRF, GBM, and DL models only): Use model checkpointing to resume training a model. Copy the `model_id` number from a built model and paste it into the *checkpoint* field in the `buildModel` cell.
---
-**How can I find out more about H2O's real-time, nano-fast scoring engine?**
+**How can I find out more about H2O's real-time, nano-fast scoring engine?**
H2O's scoring engine uses a Plain Old Java Object (POJO). The POJO code runs quickly but is single-threaded. It is intended for embedding into lightweight real-time environments.
@@ -637,24 +637,24 @@ In-H2O scoring is triggered on an existing H2O cluster, typically using a REST A
**I am using an older version of H2O (2.8 or prior) - where can I find documentation for this version?**
-If you are using H2O 2.8 or prior, we strongly recommend upgrading to the latest version of H2O if possible.
+If you are using H2O 2.8 or prior, we strongly recommend upgrading to the latest version of H2O if possible.
-If you do not wish to upgrade to the latest version, documentation for H2O Classic is available [here](http://docs.h2o.ai/h2oclassic/index.html).
+If you do not wish to upgrade to the latest version, documentation for H2O Classic is available [here](http://docs.h2o.ai/h2oclassic/index.html).
---
**I am writing an academic research paper and I would like to cite H2O in my bibliography - how should I do that?**
-To cite our software:
+To cite our software:
-- The H2O.ai Team. (2015) *h2o: R Interface for H2O*. R package version 3.1.0.99999. http://www.h2o.ai.
+- The H2O.ai Team. (2015) *h2o: R Interface for H2O*. R package version 3.1.0.99999. http://www.h2o.ai.
-- The H2O.ai Team. (2015) *h2o: h2o: Python Interface for H2O*. Python package version 3.1.0.99999. http://www.h2o.ai.
+- The H2O.ai Team. (2015) *h2o: Python Interface for H2O*. Python package version 3.1.0.99999. http://www.h2o.ai.
-- - The H2O.ai Team. (2015) *H2O: Scalable Machine Learning*. Version 3.1.0.99999. http://www.h2o.ai.
+- The H2O.ai Team. (2015) *H2O: Scalable Machine Learning*. Version 3.1.0.99999. http://www.h2o.ai.
-To cite one of our booklets:
+To cite one of our booklets:
- Nykodym, T., Hussami, N., Kraljevic, T.,Rao, A., and Wang, A. (Sept. 2015). *Generalized Linear Modeling with H2O.* http://h2o.ai/resources.
@@ -664,12 +664,12 @@ To cite one of our booklets:
- Aiello, S., Eckstrand, E., Fu, A., Landry, M., and Aboyoun, P. (Sept. 2015) *Fast Scalable R with H2O.* http://h2o.ai/resources.
-- Aiello, S., Click, C., Roark, H. and Rehak, L. (Sept. 2015) *Machine Learning with Python and H2O* http://h2o.ai/resources.
+- Aiello, S., Click, C., Roark, H. and Rehak, L. (Sept. 2015) *Machine Learning with Python and H2O* http://h2o.ai/resources.
-- Malohlava, M., and Tellez, A. (Sept. 2015) *Machine Learning with Sparkling Water: H2O + Spark* http://h2o.ai/resources.
+- Malohlava, M., and Tellez, A. (Sept. 2015) *Machine Learning with Sparkling Water: H2O + Spark* http://h2o.ai/resources.
-If you are using Bibtex:
+If you are using Bibtex:
```
@@ -737,7 +737,7 @@ If you are using Bibtex:
**How can I use Flow to export the prediction results with a dataset?**
-After obtaining your results, click the **Combine predictions with frame** button, then click the **View Frame** button.
+After obtaining your results, click the **Combine predictions with frame** button, then click the **View Frame** button.
@@ -757,7 +757,7 @@ R's GC is now used to remove unused R temps, and when the last use of a shared c
In other words:
Don't delete RTMPs, they'll disappear at the next R GC.
Don't worry about copies (they aren't getting made).
-Do Nothing and All Is Well.
+Do Nothing and All Is Well.
---
@@ -783,7 +783,7 @@ h2o.saveModel(model, dir = model_path, name = “mymodel")
**How do I specify which nodes should run H2O in a Hadoop cluster?**
-After creating and applying the desired node labels and associating them with specific queues as described in the [Hadoop documentation](http://docs.hortonworks.com/HDPDocuments/HDP2/HDP-2.2.0/YARN_RM_v22/node_labels/index.html#Item1.1), launch H2O using the following command:
+After creating and applying the desired node labels and associating them with specific queues as described in the [Hadoop documentation](http://docs.hortonworks.com/HDPDocuments/HDP2/HDP-2.2.0/YARN_RM_v22/node_labels/index.html#Item1.1), launch H2O using the following command:
`hadoop jar h2odriver.jar -Dmapreduce.job.queuename= -nodes -mapperXmx 6g -output hdfsOutputDirName`
@@ -803,13 +803,13 @@ After creating and applying the desired node labels and associating them with sp
**How do I import data from HDFS in R and in Flow?**
-To import from HDFS in R:
+To import from HDFS in R:
```
h2o.importFolder(path, pattern = "", destination_frame = "", parse = TRUE, header = NA, sep = "", col.names = NULL, na.strings = NULL)
```
-Here is another example:
+Here is another example:
```
# pathToAirlines <- "hdfs://mr-0xd6.0xdata.loc/datasets/airlines_all.csv"
@@ -817,11 +817,11 @@ Here is another example:
```
-In Flow, the easiest way is to let the auto-suggestion feature in the *Search:* field complete the path for you. Just start typing the path to the file, starting with the top-level directory, and H2O provides a list of matching files.
+In Flow, the easiest way is to let the auto-suggestion feature in the *Search:* field complete the path for you. Just start typing the path to the file, starting with the top-level directory, and H2O provides a list of matching files.

-
-Click the file to add it to the *Search:* field.
+
+Click the file to add it to the *Search:* field.
---
@@ -844,10 +844,9 @@ Each h2odriver.jar file is built with a specific Hadoop distribution so in order
wget http://h2o-release.s3.amazonaws.com/h2o/master/{{build_number}}/h2o-{{project_version}}-hdp2.1.zip
wget http://h2o-release.s3.amazonaws.com/h2o/master/{{build_number}}/h2o-{{project_version}}-hdp2.2.zip
wget http://h2o-release.s3.amazonaws.com/h2o/master/{{build_number}}/h2o-{{project_version}}-hdp2.3.zip
- wget http://h2o-release.s3.amazonaws.com/h2o/master/{{build_number}}/h2o-{{project_version}}-mapr3.1.1.zip
wget http://h2o-release.s3.amazonaws.com/h2o/master/{{build_number}}/h2o-{{project_version}}-mapr4.0.1.zip
wget http://h2o-release.s3.amazonaws.com/h2o/master/{{build_number}}/h2o-{{project_version}}-mapr5.0.zip
-
+
**Note**: Enter only one of the above commands.
@@ -863,44 +862,44 @@ Then run the command to launch the H2O Application in the driver by specifying t
**How do I use H2O with Java?**
-There are two ways to use H2O with Java. The simplest way is to call the REST API from your Java program to a remote cluster and should meet the needs of most users.
+There are two ways to use H2O with Java. The simplest way is to call the REST API from your Java program to a remote cluster and should meet the needs of most users.
-You can access the REST API documentation within Flow, or on our [documentation site](http://h2o-release.s3.amazonaws.com/h2o/{{branch_name}}/{{build_number}}/docs-website/h2o-docs/index.html#route-reference).
+You can access the REST API documentation within Flow, or on our [documentation site](http://h2o-release.s3.amazonaws.com/h2o/{{branch_name}}/{{build_number}}/docs-website/h2o-docs/index.html#route-reference).
-Flow, Python, and R all rely on the REST API to run H2O. For example, each action in Flow translates into one or more REST API calls. The script fragments in the cells in Flow are essentially the payloads for the REST API calls. Most R and Python API calls translate into a single REST API call.
+Flow, Python, and R all rely on the REST API to run H2O. For example, each action in Flow translates into one or more REST API calls. The script fragments in the cells in Flow are essentially the payloads for the REST API calls. Most R and Python API calls translate into a single REST API call.
-To see how the REST API is used with H2O:
+To see how the REST API is used with H2O:
-- Using Chrome as your internet browser, open the developer tab while viewing the web UI. As you perform tasks, review the network calls made by Flow.
+- Using Chrome as your internet browser, open the developer tab while viewing the web UI. As you perform tasks, review the network calls made by Flow.
-- Write an R program for H2O using the H2O R package that uses `h2o.startLogging()` at the beginning. All REST API calls used are logged.
+- Write an R program for H2O using the H2O R package that uses `h2o.startLogging()` at the beginning. All REST API calls used are logged.
-The second way to use H2O with Java is to embed H2O within your Java application, similar to [Sparkling Water](https://github.com/h2oai/sparkling-water/blob/master/DEVEL.md).
+The second way to use H2O with Java is to embed H2O within your Java application, similar to [Sparkling Water](https://github.com/h2oai/sparkling-water/blob/master/DEVEL.md).
---
**How do I communicate with a remote cluster using the REST API?**
-To create a set of bare POJOs for the REST API payloads that can be used by JVM REST API clients:
+To create a set of bare POJOs for the REST API payloads that can be used by JVM REST API clients:
-0. Clone the sources from GitHub.
-0. Start an H2O instance.
+0. Clone the sources from GitHub.
+0. Start an H2O instance.
0. Enter `% cd py`.
-0. Enter `% python generate_java_binding.py`.
+0. Enter `% python generate_java_binding.py`.
-This script connects to the server, gets all the metadata for the REST API schemas, and writes the Java POJOs to `{sourcehome}/build/bindings/Java`.
+This script connects to the server, gets all the metadata for the REST API schemas, and writes the Java POJOs to `{sourcehome}/build/bindings/Java`.
---
**I keep getting a message that I need to install Java. I have the latest version of Java installed, but I am still getting this message. What should I do?**
-This error message displays if the `JAVA_HOME` environment variable is not set correctly. The `JAVA_HOME` variable is likely points to Apple Java version 6 instead of Oracle Java version 8.
+This error message displays if the `JAVA_HOME` environment variable is not set correctly. The `JAVA_HOME` variable likely points to Apple Java version 6 instead of Oracle Java version 8.
-If you are running OS X 10.7 or earlier, enter the following in Terminal:
+If you are running OS X 10.7 or earlier, enter the following in Terminal:
`export JAVA_HOME=/Library/Internet\ Plug-Ins/JavaAppletPlugin.plugin/Contents/Home`
-If you are running OS X 10.8 or later, modify the launchd.plist by entering the following in Terminal:
+If you are running OS X 10.8 or later, modify the launchd.plist by entering the following in Terminal:
```
cat << EOF | sudo tee /Library/LaunchDaemons/setenv.JAVA_HOME.plist
@@ -934,7 +933,7 @@ EOF
**I tried to install H2O in Python but `pip install scikit-learn` failed - what should I do?**
-Use the following commands (prepending with `sudo` if necessary):
+Use the following commands (prepending with `sudo` if necessary):
```
easy_install pip
@@ -944,13 +943,13 @@ pip install scipy
pip install scikit-learn
```
-If you are still encountering errors and you are using OSX, the default version of Python may be installed. We recommend installing the Homebrew version of Python instead:
+If you are still encountering errors and you are using OSX, the default version of Python may be installed. We recommend installing the Homebrew version of Python instead:
```
brew install python
```
-If you are encountering errors related to missing Python packages when using H2O, refer to the following list for a complete list of all Python packages, including dependencies:
+If you are encountering errors related to missing Python packages when using H2O, refer to the following list for a complete list of all Python packages, including dependencies:
@@ -980,84 +979,84 @@ If you are encountering errors related to missing Python packages when using H2O
**How do I specify a value as an enum in Python? Is there a Python equivalent of `as.factor()` in R?**
-Use `.asfactor()` to specify a value as an enum.
+Use `.asfactor()` to specify a value as an enum.
---
**I received the following error when I tried to install H2O using the Python instructions on the downloads page - what should I do to resolve it?**
```
-Downloading/unpacking http://h2o-release.s3.amazonaws.com/h2o/rel-shannon/12/Python/h2o-3.0.0.12-py2.py3-none-any.whl
- Downloading h2o-3.0.0.12-py2.py3-none-any.whl (43.1Mb): 43.1Mb downloaded
- Running setup.py egg_info for package from http://h2o-release.s3.amazonaws.com/h2o/rel-shannon/12/Python/h2o-3.0.0.12-py2.py3-none-any.whl
- Traceback (most recent call last):
- File "", line 14, in
- IOError: [Errno 2] No such file or directory: '/tmp/pip-nTu3HK-build/setup.py'
- Complete output from command python setup.py egg_info:
- Traceback (most recent call last):
+Downloading/unpacking http://h2o-release.s3.amazonaws.com/h2o/rel-shannon/12/Python/h2o-3.0.0.12-py2.py3-none-any.whl
+ Downloading h2o-3.0.0.12-py2.py3-none-any.whl (43.1Mb): 43.1Mb downloaded
+ Running setup.py egg_info for package from http://h2o-release.s3.amazonaws.com/h2o/rel-shannon/12/Python/h2o-3.0.0.12-py2.py3-none-any.whl
+ Traceback (most recent call last):
+ File "", line 14, in
+ IOError: [Errno 2] No such file or directory: '/tmp/pip-nTu3HK-build/setup.py'
+ Complete output from command python setup.py egg_info:
+ Traceback (most recent call last):
- File "", line 14, in
+ File "", line 14, in
-IOError: [Errno 2] No such file or directory: '/tmp/pip-nTu3HK-build/setup.py'
+IOError: [Errno 2] No such file or directory: '/tmp/pip-nTu3HK-build/setup.py'
----
+---
Command python setup.py egg_info failed with error code 1 in /tmp/pip-nTu3HK-build
```
-With Python, there is no automatic update of installed packages, so you must upgrade manually. Additionally, the package distribution method recently changed from `distutils` to `wheel`. The following procedure should be tried first if you are having trouble installing the H2O package, particularly if error messages related to `bdist_wheel` or `eggs` display.
+With Python, there is no automatic update of installed packages, so you must upgrade manually. Additionally, the package distribution method recently changed from `distutils` to `wheel`. The following procedure should be tried first if you are having trouble installing the H2O package, particularly if error messages related to `bdist_wheel` or `eggs` display.
```
-# this gets the latest setuptools
-# see https://pip.pypa.io/en/latest/installing.html
-wget https://bootstrap.pypa.io/ez_setup.py -O - | sudo python
+# this gets the latest setuptools
+# see https://pip.pypa.io/en/latest/installing.html
+wget https://bootstrap.pypa.io/ez_setup.py -O - | sudo python
-# platform dependent ways of installing pip are at
-# https://pip.pypa.io/en/latest/installing.html
-# but the above should work on most linux platforms?
+# platform dependent ways of installing pip are at
+# https://pip.pypa.io/en/latest/installing.html
+# but the above should work on most linux platforms?
-# on ubuntu
-# if you already have some version of pip, you can skip this.
-sudo apt-get install python-pip
+# on ubuntu
+# if you already have some version of pip, you can skip this.
+sudo apt-get install python-pip
-# the package manager doesn't install the latest. upgrade to latest
-# we're not using easy_install any more, so don't care about checking that
-pip install pip --upgrade
+# the package manager doesn't install the latest. upgrade to latest
+# we're not using easy_install any more, so don't care about checking that
+pip install pip --upgrade
-# I've seen pip not install to the final version ..i.e. it goes to an almost
-# final version first, then another upgrade gets it to the final version.
-# We'll cover that, and also double check the install.
+# I've seen pip not install to the final version ..i.e. it goes to an almost
+# final version first, then another upgrade gets it to the final version.
+# We'll cover that, and also double check the install.
-# after upgrading pip, the path name may change from /usr/bin to /usr/local/bin
-# start a new shell, just to make sure you see any path changes
+# after upgrading pip, the path name may change from /usr/bin to /usr/local/bin
+# start a new shell, just to make sure you see any path changes
-bash
+bash
-# Also: I like double checking that the install is bulletproof by reinstalling.
-# Sometimes it seems like things say they are installed, but have errors during the install. Check for no errors or stack traces.
+# Also: I like double checking that the install is bulletproof by reinstalling.
+# Sometimes it seems like things say they are installed, but have errors during the install. Check for no errors or stack traces.
-pip install pip --upgrade --force-reinstall
+pip install pip --upgrade --force-reinstall
-# distribute should be at the most recent now. Just in case
-# don't do --force-reinstall here, it causes an issue.
+# distribute should be at the most recent now. Just in case
+# don't do --force-reinstall here, it causes an issue.
-pip install distribute --upgrade
+pip install distribute --upgrade
-# Now check the versions
-pip list | egrep '(distribute|pip|setuptools)'
-distribute (0.7.3)
-pip (7.0.3)
-setuptools (17.0)
+# Now check the versions
+pip list | egrep '(distribute|pip|setuptools)'
+distribute (0.7.3)
+pip (7.0.3)
+setuptools (17.0)
-# Re-install wheel
-pip install wheel --upgrade --force-reinstall
+# Re-install wheel
+pip install wheel --upgrade --force-reinstall
```
-After completing this procedure, go to Python and use `h2o.init()` to start H2O in Python.
+After completing this procedure, go to Python and use `h2o.init()` to start H2O in Python.
->**Note**:
+>**Note**:
>
>If you use gradlew to build the jar yourself, you have to start the jar >yourself before you do `h2o.init()`.
>
@@ -1067,13 +1066,13 @@ After completing this procedure, go to Python and use `h2o.init()` to start H2O
**How should I specify the datatype during import in Python?**
-Refer to the following example:
+Refer to the following example:
```
#Let's say you want to change the second column "CAPSULE" of prostate.csv
#to categorical. You have 3 options.
-#Option 1. Use a dictionary of column names to types.
+#Option 1. Use a dictionary of column names to types.
fr = h2o.import_file("smalldata/logreg/prostate.csv", col_types = {"CAPSULE":"Enum"})
fr.describe()
@@ -1085,9 +1084,9 @@ fr.describe()
#Option 3. Use parse_setup().
fraw = h2o.import_file("smalldata/logreg/prostate.csv", parse = False)
-fsetup = h2o.parse_setup(fraw)
+fsetup = h2o.parse_setup(fraw)
fsetup["column_types"][1] = '"Enum"'
-fr = h2o.parse_raw(fsetup)
+fr = h2o.parse_raw(fsetup)
fr.describe()
```
@@ -1108,11 +1107,11 @@ Out[26]:
**What is PySparkling? How can I use it for grid search or early stopping?**
-PySparkling basically calls H2O Python functions for all operations on H2O data frames. You can perform all H2O Python operations available in H2O Python version 3.6.0.3 or later from PySparkling.
+PySparkling basically calls H2O Python functions for all operations on H2O data frames. You can perform all H2O Python operations available in H2O Python version 3.6.0.3 or later from PySparkling.
For help on a function within IPython Notebook, run `H2OGridSearch?`
-Here is an example of grid search in PySparkling:
+Here is an example of grid search in PySparkling:
```
from h2o.grid.grid_search import H2OGridSearch
@@ -1152,7 +1151,7 @@ model_grid.train(x=x, y=y, distribution="multinomial", epochs=1000, training_fra
**Do you have a tutorial for grid search in Python?**
-Yes, a notebook is available [here](https://github.com/h2oai/h2o-3/blob/master/h2o-py/demos/H2O_tutorial_eeg_eyestate.ipynb) that demonstrates the use of grid search in Python.
+Yes, a notebook is available [here](https://github.com/h2oai/h2o-3/blob/master/h2o-py/demos/H2O_tutorial_eeg_eyestate.ipynb) that demonstrates the use of grid search in Python.
@@ -1163,7 +1162,7 @@ Yes, a notebook is available [here](https://github.com/h2oai/h2o-3/blob/master/h
**Which versions of R are compatible with H2O?**
-Currently, the only version of R that is known to not work well with H2O is R version 3.1.0 (codename "Spring Dance"). If you are using this version, we recommend upgrading the R version before using H2O.
+Currently, the only version of R that is known to not work well with H2O is R version 3.1.0 (codename "Spring Dance"). If you are using this version, we recommend upgrading the R version before using H2O.
@@ -1171,7 +1170,7 @@ Currently, the only version of R that is known to not work well with H2O is R ve
**What R packages are required to use H2O?**
-The following packages are required:
+The following packages are required:
- `methods`
- `statmod`
@@ -1182,9 +1181,9 @@ The following packages are required:
- `tools`
- `utils`
-Some of these packages have dependencies; for example, `bitops` is required, but it is a dependency of the `RCurl` package, so `bitops` is automatically included when `RCurl` is installed.
+Some of these packages have dependencies; for example, `bitops` is required, but it is a dependency of the `RCurl` package, so `bitops` is automatically included when `RCurl` is installed.
-If you are encountering errors related to missing R packages when using H2O, refer to the following list for a complete list of all R packages, including dependencies:
+If you are encountering errors related to missing R packages when using H2O, refer to the following list for a complete list of all R packages, including dependencies:
@@ -1244,32 +1243,32 @@ If you are encountering errors related to missing R packages when using H2O, ref
**How can I install the H2O R package if I am having permissions problems?**
-This issue typically occurs for Linux users when the R software was installed by a root user. For more information, refer to the following [link](https://stat.ethz.ch/R-manual/R-devel/library/base/html/libPaths.html).
+This issue typically occurs for Linux users when the R software was installed by a root user. For more information, refer to the following [link](https://stat.ethz.ch/R-manual/R-devel/library/base/html/libPaths.html).
To specify the installation location for the R packages, create a file that contains the `R_LIBS_USER` environment variable:
`echo R_LIBS_USER=\"~/.Rlibrary\" > ~/.Renviron`
-Confirm the file was created successfully using `cat`:
+Confirm the file was created successfully using `cat`:
`$ cat ~/.Renviron`
You should see the following output:
-
+
`R_LIBS_USER="~/.Rlibrary"`
Create a new directory for the environment variable:
`$ mkdir ~/.Rlibrary`
-Start R and enter the following:
+Start R and enter the following:
`.libPaths()`
-Look for the following output to confirm the changes:
+Look for the following output to confirm the changes:
```
-[1] "/.Rlibrary"
+[1] "/.Rlibrary"
[2] "/Library/Frameworks/R.framework/Versions/3.1/Resources/library"
```
@@ -1278,18 +1277,18 @@ Look for the following output to confirm the changes:
**I received the following error message after launching H2O in RStudio and using `h2o.init` - what should I do to resolve this error?**
```
-Error in h2o.init() :
+Error in h2o.init() :
Version mismatch! H2O is running version 3.2.0.9 but R package is version 3.2.0.3
```
-This error is due to a version mismatch between the H2O R package and the running H2O instance. Make sure you are using the latest version of both files by downloading H2O from the [downloads page](http://h2o.ai/download/) and installing the latest version and that you have removed any previous H2O R package versions by running:
+This error is due to a version mismatch between the H2O R package and the running H2O instance. Make sure you are using the latest version of both files by downloading H2O from the [downloads page](http://h2o.ai/download/) and installing the latest version and that you have removed any previous H2O R package versions by running:
```
if ("package:h2o" %in% search()) { detach("package:h2o", unload=TRUE) }
if ("h2o" %in% rownames(installed.packages())) { remove.packages("h2o") }
```
-Make sure to install the dependencies for the H2O R package as well:
+Make sure to install the dependencies for the H2O R package as well:
```
if (! ("methods" %in% rownames(installed.packages()))) { install.packages("methods") }
@@ -1303,7 +1302,7 @@ if (! ("utils" %in% rownames(installed.packages()))) { install.packages("utils")
```
-Finally, install the latest version of the H2O package for R:
+Finally, install the latest version of the H2O package for R:
```
install.packages("h2o", type="source", repos=(c("http://h2o-release.s3.amazonaws.com/h2o/master/{{build_number}}/R")))
@@ -1311,11 +1310,11 @@ library(h2o)
localH2O = h2o.init(nthreads=-1)
```
-If your R version is older than the H2O R package, upgrade your R version using `update.packages(checkBuilt=TRUE, ask=FALSE)`.
+If your R version is older than the H2O R package, upgrade your R version using `update.packages(checkBuilt=TRUE, ask=FALSE)`.
---
-**I received the following error message after trying to run some code - what should I do?**
+**I received the following error message after trying to run some code - what should I do?**
```
> fit <- h2o.deeplearning(x=2:4, y=1, training_frame=train_hex)
@@ -1326,26 +1325,26 @@ In addition: Warning message:
Not all shim outputs are fully supported, please see ?h2o.shim for more information
```
-Remove the `h2o.shim(enable=TRUE)` line and try running the code again. Note that the `h2o.shim` is only a way to notify users of previous versions of H2O about changes to the H2O R package - it will not revise your code, but provides suggested replacements for deprecated commands and parameters.
+Remove the `h2o.shim(enable=TRUE)` line and try running the code again. Note that the `h2o.shim` is only a way to notify users of previous versions of H2O about changes to the H2O R package - it will not revise your code, but provides suggested replacements for deprecated commands and parameters.
---
**How do I extract the model weights from a model I've creating using H2O in R? I've enabled `extract_model_weights_and_biases`, but the output refers to a file I can't open in R.**
-For an example of how to extract weights and biases from a model, refer to the following repo location on [GitHub](https://github.com/h2oai/h2o-3/blob/master/h2o-r/tests/testdir_algos/deeplearning/runit_deeplearning_weights_and_biases.R).
+For an example of how to extract weights and biases from a model, refer to the following repo location on [GitHub](https://github.com/h2oai/h2o-3/blob/master/h2o-r/tests/testdir_algos/deeplearning/runit_deeplearning_weights_and_biases.R).
---
**How do I extract the run time of my model as output?**
-For the following example:
+For the following example:
```
out.h2o.rf = h2o.randomForest( x=c("x1", "x2", "x3", "w"), y="y", training_frame=h2o.df.train, seed=555, model_id= "my.model.1st.try.out.h2o.rf" )
```
-Use `out.h2o.rf@model$run_time` to determine the value of the `run_time` variable.
+Use `out.h2o.rf@model$run_time` to determine the value of the `run_time` variable.
---
@@ -1358,9 +1357,9 @@ We strongly recommend using `h2o.group_by` for this function instead of `h2o.ddp
newframe <- h2o.group_by(h2oframe, by="footwear_category", nrow("email_event_click_ct"), sum("email_event_click_ct"), mean("email_event_click_ct"), sd("email_event_click_ct"), gb.control = list( col.names=c("count", "total_email_event_click_ct", "avg_email_event_click_ct", "std_email_event_click_ct") ) )
```
-Using `gb.control` is optional; here it is included so the column names are user-configurable.
+Using `gb.control` is optional; here it is included so the column names are user-configurable.
-The `by` option can take a list of columns if you want to group by more than one column to compute the summary as shown in the following example:
+The `by` option can take a list of columns if you want to group by more than one column to compute the summary as shown in the following example:
```
newframe <- h2o.group_by(h2oframe, by=c("footwear_category","age_group"), nrow("email_event_click_ct"), sum("email_event_click_ct"), mean("email_event_click_ct"), sd("email_event_click_ct"), gb.control = list( col.names=c("count", "total_email_event_click_ct", "avg_email_event_click_ct", "std_email_event_click_ct") ) )
@@ -1372,13 +1371,13 @@ newframe <- h2o.group_by(h2oframe, by=c("footwear_category","age_group"), nrow("
**I'm using CentOS and I want to run H2O in R - are there any dependencies I need to install?**
-Yes, make sure to install `libcurl`, which allows H2O to communicate with R. We also recommend disabling SElinux and any firewalls, at least initially until you have confirmed H2O can initialize.
+Yes, make sure to install `libcurl`, which allows H2O to communicate with R. We also recommend disabling SElinux and any firewalls, at least initially until you have confirmed H2O can initialize.
---
**How do I change variable/header names on an H2O frame in R?**
-There are two ways to change header names. To specify the headers during parsing, import the headers in R and then specify the header as the column name when the actual data frame is imported:
+There are two ways to change header names. To specify the headers during parsing, import the headers in R and then specify the header as the column name when the actual data frame is imported:
```
header <- h2o.importFile(path = pathToHeader)
@@ -1386,14 +1385,14 @@ data <- h2o.importFile(path = pathToData, col.names = header)
data
```
-You can also use the `names()` function:
+You can also use the `names()` function:
```
header <- c("user", "specified", "column", "names")
data <- h2o.importFile(path = pathToData)
names(data) <- header
```
-To replace specific column names, you can also use a `sub/gsub` in R:
+To replace specific column names, you can also use a `sub/gsub` in R:
```
header <- c("user", "specified", "column", "names")
@@ -1409,13 +1408,13 @@ names(data) <- sub(pattern = "user", replacement = "computer", x = names(header)
**My R terminal crashed - how can I re-access my H2O frame?**
-Launch H2O and use your web browser to access the web UI, Flow, at `localhost:54321`. Click the **Data** menu, then click **List All Frames**. Copy the frame ID, then run `h2o.ls()` in R to list all the frames, or use the frame ID in the following code (replacing `YOUR_FRAME_ID` with the frame ID):
+Launch H2O and use your web browser to access the web UI, Flow, at `localhost:54321`. Click the **Data** menu, then click **List All Frames**. Copy the frame ID, then run `h2o.ls()` in R to list all the frames, or use the frame ID in the following code (replacing `YOUR_FRAME_ID` with the frame ID):
```
library(h2o)
localH2O = h2o.init(ip="sri.h2o.ai", port=54321, startH2O = F, strict_version_check=T)
data_frame <- h2o.getFrame(frame_id = "YOUR_FRAME_ID")
-```
+```
---
**How do I remove rows containing NAs in an H2OFrame?**
@@ -1440,19 +1439,19 @@ Removing rows 1, 3, 4, 5 to get:
6 0 1 2 3 2
```
-Use `na.omit(myFrame)`, where `myFrame` represents the name of the frame you are editing.
+Use `na.omit(myFrame)`, where `myFrame` represents the name of the frame you are editing.
---
**I installed H2O in R using OS X and updated all the dependencies, but the following error message displayed: `Error in .h2o.doSafeREST(h2oRestApiVersion = h2oRestApiVersion, Unexpected CURL error: Empty reply from server` - what should I do?**
-This error message displays if the `JAVA_HOME` environment variable is not set correctly. The `JAVA_HOME` variable is likely points to Apple Java version 6 instead of Oracle Java version 8.
+This error message displays if the `JAVA_HOME` environment variable is not set correctly. The `JAVA_HOME` variable likely points to Apple Java version 6 instead of Oracle Java version 8.
-If you are running OS X 10.7 or earlier, enter the following in Terminal:
+If you are running OS X 10.7 or earlier, enter the following in Terminal:
`export JAVA_HOME=/Library/Internet\ Plug-Ins/JavaAppletPlugin.plugin/Contents/Home`
-If you are running OS X 10.8 or later, modify the launchd.plist by entering the following in Terminal:
+If you are running OS X 10.8 or later, modify the launchd.plist by entering the following in Terminal:
```
cat << EOF | sudo tee /Library/LaunchDaemons/setenv.JAVA_HOME.plist
@@ -1486,7 +1485,7 @@ in progress - commenting out until complete
**How do I extract the variable importance from the output in R?**
-Launch R, then enter the following:
+Launch R, then enter the following:
```
library(h2o)
@@ -1510,7 +1509,7 @@ is.data.frame(m@model$varimp)
# [1] TRUE
names(m@model$varimp)
-# [1] "Relative importance" "Scaled.Values" "Percent.Influence"
+# [1] "Relative importance" "Scaled.Values" "Percent.Influence"
rownames(m@model$varimp)
# [1] "Petal.Width" "Petal.Length" "Sepal.Length" "Sepal.Width"
@@ -1539,14 +1538,14 @@ newframe$avg_email_event_click_ct2 = newframe$total_email_event_click_ct / newfr
**How are the results of `h2o.predict` displayed?**
-The order of the rows in the results for `h2o.predict` is the same as the order in which the data was loaded, even if some rows fail (for example, due to missing values or unseen factor levels). To bind a per-row identifier, use `cbind`.
+The order of the rows in the results for `h2o.predict` is the same as the order in which the data was loaded, even if some rows fail (for example, due to missing values or unseen factor levels). To bind a per-row identifier, use `cbind`.
---
**How do I view all the variable importances for a model?**
-By default, H2O returns the top five and lowest five variable importances.
-To view all the variable importances, use the following:
+By default, H2O returns the top five and lowest five variable importances.
+To view all the variable importances, use the following:
```
model <- h2o.getModel(model_id = "my_H2O_modelID",conn=localH2O)
@@ -1559,7 +1558,7 @@ varimp<-as.data.frame(h2o.varimp(model))
**How do I add random noise to a column in an H2O frame?**
-To add random noise to a column in an H2O frame, refer to the following example:
+To add random noise to a column in an H2O frame, refer to the following example:
```
h2o.init()
@@ -1581,24 +1580,24 @@ new_fr
**What are the advantages of using Sparkling Water compared with H2O?**
-Sparkling Water contains the same features and functionality as H2O but provides a way to use H2O with [Spark](http://spark.apache.org/), a large-scale cluster framework.
+Sparkling Water contains the same features and functionality as H2O but provides a way to use H2O with [Spark](http://spark.apache.org/), a large-scale cluster framework.
-Sparkling Water is ideal for H2O users who need to manage large clusters for their data processing needs and want to transfer data from Spark to H2O (or vice versa).
+Sparkling Water is ideal for H2O users who need to manage large clusters for their data processing needs and want to transfer data from Spark to H2O (or vice versa).
-There is also a Python interface available to enable access to Sparkling Water directly from PySpark.
+There is also a Python interface available to enable access to Sparkling Water directly from PySpark.
----
+---
**How do I filter an H2OFrame using Sparkling Water?**
-Filtering columns is easy: just remove the unnecessary columns or create a new H2OFrame from the columns you want to include (`Frame(String[] names, Vec[] vec)`), then make the H2OFrame wrapper around it (`new H2OFrame(frame)`).
+Filtering columns is easy: just remove the unnecessary columns or create a new H2OFrame from the columns you want to include (`Frame(String[] names, Vec[] vec)`), then make the H2OFrame wrapper around it (`new H2OFrame(frame)`).
-Filtering rows is a little bit harder. There are two ways:
+Filtering rows is a little bit harder. There are two ways:
-- Create an additional binary vector holding `1/0` for the `in/out` sample (make sure to take this additional vector into account in your computations). This solution is quite cheap, since you do not duplicate data - just create a simple vector in a data walk.
+- Create an additional binary vector holding `1/0` for the `in/out` sample (make sure to take this additional vector into account in your computations). This solution is quite cheap, since you do not duplicate data - just create a simple vector in a data walk.
+
+ or
- or
-
- Create a new frame with the filtered rows. This is a harder task, since you have to copy data. For reference, look at the #deepSlice call on Frame (`H2OFrame`)
@@ -1607,7 +1606,7 @@ Filtering rows is a little bit harder. There are two ways:
**How can I save and load a K-means model using Sparkling Water?**
-The following example code defines the save and load functions explicitly.
+The following example code defines the save and load functions explicitly.
```
import water._
@@ -1640,7 +1639,7 @@ def loadH2OModel[M <: Model[_, _, _]](source: URI) : M = {
val l = new ObjectTreeBinarySerializer().load(source)
l.get(0).get().asInstanceOf[M]
}
-
+
// Load model
val h2oModel: Model[_, _, _] = loadH2OModel(new File("../h2omodel.bin").toURI)
```
@@ -1650,7 +1649,7 @@ val h2oModel: Model[_, _, _] = loadH2OModel(new File("../h2omodel.bin").toURI)
**How do I inspect H2O using Flow while a droplet is running?**
-If your droplet execution time is very short, add a simple sleep statement to your code:
+If your droplet execution time is very short, add a simple sleep statement to your code:
`Thread.sleep(...)`
@@ -1658,11 +1657,11 @@ If your droplet execution time is very short, add a simple sleep statement to yo
**How do I change the memory size of the executors in a droplet?**
-There are two ways to do this:
+There are two ways to do this:
- Change your default Spark setup in `$SPARK_HOME/conf/spark-defaults.conf`
- or
+ or
- Pass `--conf` via spark-submit when you launch your droplet (e.g., `$SPARK_HOME/bin/spark-submit --conf spark.executor.memory=4g --master $MASTER --class org.my.Droplet $TOPDIR/assembly/build/libs/droplet.jar`
@@ -1684,38 +1683,38 @@ water.DException$DistributedException: from /10.23.36.177:54321; by class water.
at water.MRTask.compute2(MRTask.java:398)
```
-This error output displays if the input file is not present on all nodes. Because of the way that Sparkling Water distributes data, the input file is required on all nodes (including remote), not just the primary node. Make sure there is a copy of the input file on all the nodes, then try again.
+This error output displays if the input file is not present on all nodes. Because of the way that Sparkling Water distributes data, the input file is required on all nodes (including remote), not just the primary node. Make sure there is a copy of the input file on all the nodes, then try again.
---
**Are there any drawbacks to using Sparkling Water compared to standalone H2O?**
-The version of H2O embedded in Sparkling Water is the same as the standalone version.
+The version of H2O embedded in Sparkling Water is the same as the standalone version.
---
**How do I use Sparkling Water from the Spark shell?**
-There are two methods:
+There are two methods:
- Use `$SPARK_HOME/bin/spark-shell --packages ai.h2o:sparkling-water-core_2.10:1.3.3`
- or
-
+ or
+
- `bin/sparkling-shell`
-The software distribution provides example scripts in the `examples/scripts` directory:
+The software distribution provides example scripts in the `examples/scripts` directory:
`bin/sparkling-shell -i examples/scripts/chicagoCrimeSmallShell.script.scala`
-For either method, initialize H2O as shown in the following example:
+For either method, initialize H2O as shown in the following example:
```
import org.apache.spark.h2o._
val h2oContext = new H2OContext(sc).start()
```
-After successfully launching H2O, the following output displays:
+After successfully launching H2O, the following output displays:
```
Sparkling Water Context:
@@ -1729,27 +1728,27 @@ Sparkling Water Context:
------------------------
Open H2O Flow in browser: http://172.16.2.223:54327 (CMD + click in Mac OSX)
-
+
```
---
**How do I use H2O with Spark Submit?**
-Spark Submit is for submitting self-contained applications. For more information, refer to the [Spark documentation](https://spark.apache.org/docs/latest/quick-start.html#self-contained-applications).
+Spark Submit is for submitting self-contained applications. For more information, refer to the [Spark documentation](https://spark.apache.org/docs/latest/quick-start.html#self-contained-applications).
-First, initialize H2O:
+First, initialize H2O:
```
import org.apache.spark.h2o._
val h2oContext = new H2OContext(sc).start()
```
-The Sparkling Water distribution provides several examples of self-contained applications built with Sparkling Water. To run the examples:
+The Sparkling Water distribution provides several examples of self-contained applications built with Sparkling Water. To run the examples:
`bin/run-example.sh ChicagoCrimeAppSmall`
-The "magic" behind `run-example.sh` is a regular Spark Submit:
+The "magic" behind `run-example.sh` is a regular Spark Submit:
`$SPARK_HOME/bin/spark-submit ChicagoCrimeAppSmall --packages ai.h2o:sparkling-water-core_2.10:1.3.3 --packages ai.h2o:sparkling-water-examples_2.10:1.3.3`
@@ -1757,7 +1756,7 @@ The "magic" behind `run-example.sh` is a regular Spark Submit:
**How do I use Sparkling Water with Databricks cloud?**
-Sparkling Water compatibility with Databricks cloud is still in development.
+Sparkling Water compatibility with Databricks cloud is still in development.
@@ -1765,41 +1764,41 @@ Sparkling Water compatibility with Databricks cloud is still in development.
**How do I develop applications with Sparkling Water?**
-For a regular Spark application (a self-contained application as described in the [Spark documentation](https://spark.apache.org/docs/latest/quick-start.html#self-contained-applications)), the app needs to initialize `H2OServices` via `H2OContext`:
+For a regular Spark application (a self-contained application as described in the [Spark documentation](https://spark.apache.org/docs/latest/quick-start.html#self-contained-applications)), the app needs to initialize `H2OServices` via `H2OContext`:
```
import org.apache.spark.h2o._
val h2oContext = new H2OContext(sc).start()
```
-For more information, refer to the [Sparkling Water development documentation](https://github.com/h2oai/sparkling-water/blob/master/DEVEL.md).
+For more information, refer to the [Sparkling Water development documentation](https://github.com/h2oai/sparkling-water/blob/master/DEVEL.md).
---
**How do I connect to Sparkling Water from R or Python?**
-After starting `H2OServices` by starting `H2OContext`, point your client to the IP address and port number specified in `H2OContext`.
+After starting `H2OServices` by starting `H2OContext`, point your client to the IP address and port number specified in `H2OContext`.
---
**I'm getting a `java.lang.ArrayIndexOutOfBoundsException` when I try to run Sparkling Water - what do I need to do to resolve this error?**
-This error message displays if you have not set up the `H2OContext` before running Sparkling Water. To set up the `H2OContext`:
+This error message displays if you have not set up the `H2OContext` before running Sparkling Water. To set up the `H2OContext`:
```
import org.apache.spark.h2o._
val h2oContext = new H2OContext(sc)
```
-After setting up `H2OContext`, try to run Sparkling Water again.
+After setting up `H2OContext`, try to run Sparkling Water again.
---
##Tunneling between servers with H2O
-To tunnel between servers (for example, due to firewalls):
+To tunnel between servers (for example, due to firewalls):
1. Use ssh to log in to the machine where H2O will run.
-2. Start an instance of H2O by locating the working directory and calling a java command similar to the following example.
+2. Start an instance of H2O by locating the working directory and calling a java command similar to the following example.
The port number chosen here is arbitrary; yours may be different.
@@ -1844,7 +1843,7 @@ To see this in action note that the web UI is pointed at
localhost:55577, but that the cluster status shows the cluster running
on 192.168.1.173:55599
-
+
---
diff --git a/h2o-docs/src/product/howto/H2O-DevHadoop.md b/h2o-docs/src/product/howto/H2O-DevHadoop.md
index 164918ec7fb..850a556a627 100644
--- a/h2o-docs/src/product/howto/H2O-DevHadoop.md
+++ b/h2o-docs/src/product/howto/H2O-DevHadoop.md
@@ -1,6 +1,6 @@
# ... On Hadoop
-Currently supported versions:
+Currently supported versions:
- CDH 5.2
- CDH 5.3
@@ -12,18 +12,17 @@ Currently supported versions:
- HDP 2.2
- HDP 2.3
- HDP 2.4
-- MapR 3.1.1
- MapR 4.0.1
- MapR 5.0
- MapR 5.1
-**Important Points to Remember**:
+**Important Points to Remember**:
- The command used to launch H2O differs from previous versions (refer to the [Tutorial](#Tutorial) section)
-- Launching H2O on Hadoop requires at least 6 GB of memory
+- Launching H2O on Hadoop requires at least 6 GB of memory
- Each H2O node runs as a mapper
- Run only one mapper per host
-- There are no combiners or reducers
+- There are no combiners or reducers
- Each H2O cluster must have a unique job name
- `-mapperXmx`, `-nodes`, and `-output` are required
- Root permissions are not required - just unzip the H2O .zip file on any single node
@@ -32,17 +31,17 @@ Currently supported versions:
Prerequisite: Open Communication Paths
--------------------------------------
-H2O communicates using two communication paths. Verify these are open and available for use by H2O.
+H2O communicates using two communication paths. Verify these are open and available for use by H2O.
**Path 1: mapper to driver**
-Optionally specify this port using the `-driverport` option in the `hadoop jar` command (see "Hadoop Launch Parameters" below). This port is opened on the driver host (the host where you entered the `hadoop jar` command). By default, this port is chosen randomly by the operating system.
+Optionally specify this port using the `-driverport` option in the `hadoop jar` command (see "Hadoop Launch Parameters" below). This port is opened on the driver host (the host where you entered the `hadoop jar` command). By default, this port is chosen randomly by the operating system.
**Path 2: mapper to mapper**
-Optionally specify this port using the `-baseport` option in the `hadoop jar` command (refer to [Hadoop Launch Parameters](#LaunchParam) below. This port and the next subsequent port are opened on the mapper hosts (the Hadoop worker nodes) where the H2O mapper nodes are placed by the Resource Manager. By default, ports 54321 (TCP) and 54322 (TCP & UDP) are used.
+Optionally specify this port using the `-baseport` option in the `hadoop jar` command (refer to [Hadoop Launch Parameters](#LaunchParam) below). This port and the next subsequent port are opened on the mapper hosts (the Hadoop worker nodes) where the H2O mapper nodes are placed by the Resource Manager. By default, ports 54321 (TCP) and 54322 (TCP & UDP) are used.
-The mapper port is adaptive: if 54321 and 54322 are not available, H2O will try 54323 and 54324 and so on. The mapper port is designed to be adaptive because sometimes if the YARN cluster is low on resources, YARN will place two H2O mappers for the same H2O cluster request on the same physical host. For this reason, we recommend opening a range of more than two ports (20 ports should be sufficient).
+The mapper port is adaptive: if 54321 and 54322 are not available, H2O will try 54323 and 54324 and so on. The mapper port is designed to be adaptive because sometimes if the YARN cluster is low on resources, YARN will place two H2O mappers for the same H2O cluster request on the same physical host. For this reason, we recommend opening a range of more than two ports (20 ports should be sufficient).
----
@@ -68,7 +67,7 @@ The following tutorial will walk the user through the download or build of H2O a
The above command launches a 6g node of H2O. We recommend you launch the cluster with at least four times the memory of your data file size.
- - *mapperXmx* is the mapper size or the amount of memory allocated to each node. Specify at least 6 GB.
+ - *mapperXmx* is the mapper size or the amount of memory allocated to each node. Specify at least 6 GB.
- *nodes* is the number of nodes requested to form the cluster.
@@ -86,7 +85,7 @@ review the output from your command after the nodes has clouded up and formed a
H2O node 172.16.2.184:54321 requested flatfile
Sending flatfiles to nodes...
[Sending flatfile to node 172.16.2.184:54321]
- H2O node 172.16.2.184:54321 reports H2O cluster size 1
+ H2O node 172.16.2.184:54321 reports H2O cluster size 1
H2O cluster (1 nodes) is up
Blocking until the H2O cluster shuts down...
@@ -96,50 +95,50 @@ review the output from your command after the nodes has clouded up and formed a
Hadoop Launch Parameters
------------------------
-- `-h | -help`: Display help
+- `-h | -help`: Display help
- `-jobname `: Specify a job name for the Jobtracker to use; the default is `H2O_nnnnn` (where n is chosen randomly)
-- `-driverif driver callback interface>`: Specify the IP address for callback messages from the mapper to the driver.
-- `-driverport callback interface>`: Specify the port number for callback messages from the mapper to the driver.
-- `-network [,]`: Specify the IPv4 network(s) to bind to the H2O nodes; multiple networks can be specified to force H2O to use the specified host in the Hadoop cluster. `10.1.2.0/24` allows 256 possibilities.
-- `-timeout `: Specify the timeout duration (in seconds) to wait for the cluster to form before failing.
- **Note**: The default value is 120 seconds; if your cluster is very busy, this may not provide enough time for the nodes to launch. If H2O does not launch, try increasing this value (for example, `-timeout 600`).
+- `-driverif <IP address of driver callback interface>`: Specify the IP address for callback messages from the mapper to the driver.
+- `-driverport <port of driver callback interface>`: Specify the port number for callback messages from the mapper to the driver.
+- `-network <IPv4network1Specification>[,<IPv4network2Specification> ...]`: Specify the IPv4 network(s) to bind to the H2O nodes; multiple networks can be specified to force H2O to use the specified host in the Hadoop cluster. `10.1.2.0/24` allows 256 possibilities.
+- `-timeout <seconds>`: Specify the timeout duration (in seconds) to wait for the cluster to form before failing.
+ **Note**: The default value is 120 seconds; if your cluster is very busy, this may not provide enough time for the nodes to launch. If H2O does not launch, try increasing this value (for example, `-timeout 600`).
- `-disown`: Exit the driver after the cluster forms.
-- `-notify `: Specify a file to write when the cluster is up. The file contains the IP and port of the embedded web server for one of the nodes in the cluster. All mappers must start before the H2O cloud is considered "up".
-- `-mapperXmx `: Specify the amount of memory to allocate to H2O (at least 6g).
-- `-extramempercent <0-20>`: Specify the extra memory for internal JVM use outside of the Java heap. This is a percentage of `mapperXmx`.
-- `-n | -nodes `: Specify the number of nodes.
-- `-nthreads `: Specify the number of CPUs to use. Enter `-1` to use all CPUs on the host, or enter a positive integer.
-- `-baseport `: Specify the initialization port for the H2O nodes. The default is `54321`.
-- `-ea`: Enable assertions to verify boolean expressions for error detection.
-- `-verbose:gc`: Include heap and garbage collection information in the logs.
-- `-XX:+PrintGCDetails`: Include a short message after each garbage collection.
-- `-license `: Specify the directory of local filesytem location and the license file name.
-- `-o | -output `: Specify the HDFS directory for the output.
+- `-notify <notification file name>`: Specify a file to write when the cluster is up. The file contains the IP and port of the embedded web server for one of the nodes in the cluster. All mappers must start before the H2O cloud is considered "up".
+- `-mapperXmx <per mapper Java Xmx heap size>`: Specify the amount of memory to allocate to H2O (at least 6g).
+- `-extramempercent <0-20>`: Specify the extra memory for internal JVM use outside of the Java heap. This is a percentage of `mapperXmx`.
+- `-n | -nodes <number of H2O nodes>`: Specify the number of nodes.
+- `-nthreads <number of CPUs>`: Specify the number of CPUs to use. Enter `-1` to use all CPUs on the host, or enter a positive integer.
+- `-baseport <initialization port for H2O nodes>`: Specify the initialization port for the H2O nodes. The default is `54321`.
+- `-ea`: Enable assertions to verify boolean expressions for error detection.
+- `-verbose:gc`: Include heap and garbage collection information in the logs.
+- `-XX:+PrintGCDetails`: Include a short message after each garbage collection.
+- `-license <license file name>`: Specify the directory of local filesystem location and the license file name.
+- `-o | -output <HDFS output directory>`: Specify the HDFS directory for the output.
- `-flow_dir `: Specify the directory for saved flows. By default, H2O will try to find the HDFS home directory to use as the directory for flows. If the HDFS home directory is not found, flows cannot be saved unless a directory is specified using `-flow_dir`.
##Accessing S3 Data from Hadoop
-H2O launched on Hadoop can access S3 Data in addition to to HDFS. To enable access, follow the instructions below.
+H2O launched on Hadoop can access S3 data in addition to HDFS. To enable access, follow the instructions below.
-Edit Hadoop's `core-site.xml`, then set the `HADOOP_CONF_DIR` environment property to the directory containing the `core-site.xml` file. For an example `core-site.xml` file, refer to [Core-site.xml](#Example). Typically, the configuration directory for most Hadoop distributions is `/etc/hadoop/conf`.
+Edit Hadoop's `core-site.xml`, then set the `HADOOP_CONF_DIR` environment property to the directory containing the `core-site.xml` file. For an example `core-site.xml` file, refer to [Core-site.xml](#Example). Typically, the configuration directory for most Hadoop distributions is `/etc/hadoop/conf`.
You can also pass the S3 credentials when launching H2O with the Hadoop jar command. Use the `-D` flag to pass the credentials:
hadoop jar h2odriver.jar -Dfs.s3.awsAccessKeyId="${AWS_ACCESS_KEY}" -Dfs.s3n.awsSecretAccessKey="${AWS_SECRET_KEY}" -n 3 -mapperXmx 10g -output outputDirectory
-
+
where `AWS_ACCESS_KEY` represents your user name and `AWS_SECRET_KEY` represents your password.
-Then import the data with the S3 URL path:
+Then import the data with the S3 URL path:
- To import the data from the Flow API:
- importFiles [ "s3n:/path/to/bucket/file/file.tab.gz" ]
+ importFiles [ "s3n:/path/to/bucket/file/file.tab.gz" ]
- To import the data from the R API:
-
+
h2o.importFile(path = "s3n://bucket/path/to/file.csv")
- To import the data from the Python API:
-
+
h2o.import_frame(path = "s3n://bucket/path/to/file.csv")
diff --git a/h2o-docs/src/product/welcome.rst b/h2o-docs/src/product/welcome.rst
index 6e139f642b5..6c7fd15ce1b 100644
--- a/h2o-docs/src/product/welcome.rst
+++ b/h2o-docs/src/product/welcome.rst
@@ -15,25 +15,25 @@ Requirements
At a minimum, we recommend the following for compatibility with H2O:
- **Operating Systems**:
-
+
- Windows 7 or later
- OS X 10.9 or later
- Ubuntu 12.04
- RHEL/CentOS 6 or later
-
+
- **Languages**: Scala, R, and Python are not required to use H2O unless you want to use H2O in those environments, but Java is always required. Supported versions include:
- Java 7 or later. **Note**: Java 9 is not yet released and is not currently supported.
- - To build H2O or run H2O tests, the 64-bit JDK is required.
- - To run the H2O binary using either the command line, R, or Python packages, only 64-bit JRE is required.
+ - To build H2O or run H2O tests, the 64-bit JDK is required.
+ - To run the H2O binary using either the command line, R, or Python packages, only 64-bit JRE is required.
- Both of these are available on the `Java download page `__.
- Scala 2.10 or later
- R version 3 or later
- Python 2.7.x or 3.5.x
-
-- **Browser**: An internet browser is required to use H2O's web UI, Flow. Supported versions include the latest version of Chrome, Firefox, Safari, or Internet Explorer.
+
+- **Browser**: An internet browser is required to use H2O's web UI, Flow. Supported versions include the latest version of Chrome, Firefox, Safari, or Internet Explorer.
Additional Requirements
~~~~~~~~~~~~~~~~~~~~~~~
@@ -41,12 +41,11 @@ Additional Requirements
- **Hadoop**: Hadoop is not required to run H2O unless you want to deploy H2O on a Hadoop cluster. Supported versions are listed on the `Download page `_ (when you select the Install on Hadoop tab) and include:
- Cloudera CDH 5.2 or later (5.3 is recommended)
- - MapR 3.1.1 or later
- - Hortonworks HDP 2.1 or later
-
+ - Hortonworks HDP 2.1 or later
+
Refer to the :ref:`on-hadoop` section for detailed information.
-- **Conda 2.7 or 3.5 repo**: Conda is not required to run H2O unless you want to run H2O on the Anaconda Cloud. Refer to the :ref:`anaconda` section for more information.
+- **Conda 2.7 or 3.5 repo**: Conda is not required to run H2O unless you want to run H2O on the Anaconda Cloud. Refer to the :ref:`anaconda` section for more information.
- **Spark**: Version 1.6 or 2.0. Spark is only required if you want to run `Sparkling Water `__.
@@ -75,7 +74,7 @@ learn more:
- :ref:`Data_Science`: This section describes the science behind our algorithms and provides a detailed, per-algo view of each model type.
-- `GitHub Help `_: The GitHub Help system is a useful resource for becoming familiar with Git.
+- `GitHub Help `_: The GitHub Help system is a useful resource for becoming familiar with Git.
New User Quick Start
~~~~~~~~~~~~~~~~~~~~
@@ -87,7 +86,7 @@ New users can follow the steps below to quickly get up and running with H2O dire
::
user$ mkdir ~/Desktop/repos
-
+
2. Change directories to that new folder, and then clone the repository. Notice that the prompt changes when you change directories.
::
@@ -102,7 +101,7 @@ New users can follow the steps below to quickly get up and running with H2O dire
repos user$ cd h2o-3
h2o-3 user$
-4. Run the following command to retrieve sample datasets. These datasets are used throughout this User Guide and within the `Booklets `_.
+4. Run the following command to retrieve sample datasets. These datasets are used throughout this User Guide and within the `Booklets `_.
::
@@ -127,11 +126,11 @@ At this point, determine whether you want to complete this quick start in either
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
>
-
+
# Copy and paste the following commands in R to download dependency packages.
> pkgs <- c("methods","statmod","stats","graphics","RCurl","jsonlite","tools","utils")
> for (pkg in pkgs) {if (! (pkg %in% rownames(installed.packages()))) { install.packages(pkg) }}
-
+
# Run the following command to load the H2O:
> library(h2o)
@@ -171,7 +170,7 @@ At this point, determine whether you want to complete this quick start in either
# Start python
h2o-3 user$ python
- >>>
+ >>>
# Run the following command to import the H2O module:
>>> import h2o
@@ -394,7 +393,7 @@ After starting multiple "worker" node processes in addition to the JUnit test pr
- `H2O Droplet Project Templates `_: This page provides template info for projects created in Java, Scala, or Sparkling Water.
- H2O Scala API Developer Documentation for `Scala 2.11 <../h2o-scala_2.11/scaladoc/index.html>`__ or `Scala 2.10 <../h2o-scala_2.10/scaladoc/index.html>`__: The definitive Scala API guide for H2O.
-
+
- `Hacking Algos `_: This blog post by Cliff walks you through building a new algorithm, using K-Means, Quantiles, and Grep as examples.
- `KV Store Guide `_: Learn more about performance characteristics when implementing new algorithms.
@@ -423,7 +422,6 @@ Supported Versions
- HDP 2.3
- HDP 2.4
- HDP 2.5
-- MapR 3.1
- MapR 4.0
- MapR 5.0
- MapR 5.1
@@ -442,7 +440,7 @@ Supported Versions
Prerequisite: Open Communication Paths
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-H2O communicates using two communication paths. Verify these are open and available for use by H2O.
+H2O communicates using two communication paths. Verify these are open and available for use by H2O.
**Path 1: mapper to driver**
@@ -498,7 +496,7 @@ The following steps show you how to download or build H2O with Hadoop and the pa
H2O node 172.16.2.184:54321 requested flatfile
Sending flatfiles to nodes...
[Sending flatfile to node 172.16.2.184:54321]
- H2O node 172.16.2.184:54321 reports H2O cluster size 1
+ H2O node 172.16.2.184:54321 reports H2O cluster size 1
H2O cluster (1 nodes) is up
Blocking until the H2O cluster shuts down...
@@ -552,7 +550,7 @@ Then import the data with the S3 URL path:
::
- importFiles [ "s3n:/path/to/bucket/file/file.tab.gz" ]
+ importFiles [ "s3n:/path/to/bucket/file/file.tab.gz" ]
- To import the data from the R API:
@@ -840,7 +838,7 @@ EC2 Instances & S3 Storage
To use the Amazon Web Services (AWS) S3 storage solution, you will need to pass your S3 access credentials to H2O. This will allow you to access your data on S3 when importing data frames with path prefixes ``s3n://...``.
-To use the `Minio Cloud Storage `__, you will need to pass an endpoint in addition to access credentials.
+To use the `Minio Cloud Storage `__, you will need to pass an endpoint in addition to access credentials.
For security reasons, we recommend writing a script to read the access credentials that are stored in a separate file. This will not only keep your credentials from propagating to other locations, but it will also make it easier to change the credential information later.
@@ -866,7 +864,7 @@ When running H2O in standalone mode using the simple Java launch command, we can
fs.s3n.awsSecretAccessKey
[AWS SECRET ACCESS KEY]
-
+
2. Launch with the configuration file ``core-site.xml`` by entering the following in the command line:
@@ -891,7 +889,7 @@ When running H2O in standalone mode using the simple Java launch command, we can
- To import the data from the Python API:
::
-
+
h2o.import_file(path = "s3n://:@bucket/path/to/file.csv")
AWS Multi-Node Instance
@@ -899,47 +897,47 @@ AWS Multi-Node Instance
`Python `_ and the `boto `_ Python library are required to launch a multi-node instance of H2O on EC2. Confirm these dependencies are installed before proceeding.
-For more information, refer to the `H2O EC2 repo `_.
+For more information, refer to the `H2O EC2 repo `_.
-Build a cluster of EC2 instances by running the following commands on the host that can access the nodes using a public DNS name.
+Build a cluster of EC2 instances by running the following commands on the host that can access the nodes using a public DNS name.
+
+1. Edit `h2o-cluster-launch-instances.py` to include your SSH key name and security group name, as well as any other environment-specific variables.
-1. Edit `h2o-cluster-launch-instances.py` to include your SSH key name and security group name, as well as any other environment-specific variables.
-
::
./h2o-cluster-launch-instances.py
./h2o-cluster-distribute-h2o.sh
--OR--
-
- ::
+
+ ::
./h2o-cluster-launch-instances.py
./h2o-cluster-download-h2o.sh
- **Note**: The second method may be faster than the first because download pulls from S3.
+ **Note**: The second method may be faster than the first because download pulls from S3.
-2. Distribute the credentials using ``./h2o-cluster-distribute-aws-credentials.sh``.
+2. Distribute the credentials using ``./h2o-cluster-distribute-aws-credentials.sh``.
- **Note**: If you are running H2O using an IAM role, it is not necessary to distribute the AWS credentials to all the nodes in the cluster. The latest version of H2O can access the temporary access key.
+ **Note**: If you are running H2O using an IAM role, it is not necessary to distribute the AWS credentials to all the nodes in the cluster. The latest version of H2O can access the temporary access key.
**Caution**: Distributing the AWS credentials copies the Amazon `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` to the instances to enable S3 and S3N access. Use caution when adding your security keys to the cloud.
-3. Start H2O by launching one H2O node per EC2 instance:
-
- ::
+3. Start H2O by launching one H2O node per EC2 instance:
+
+ ::
./h2o-cluster-start-h2o.sh
- Wait 60 seconds after entering the command before entering it on the next node.
-
+ Wait 60 seconds after entering the command before entering it on the next node.
+
4. In your internet browser, substitute any of the public DNS node addresses for *IP_ADDRESS* in the following example: ``http://IP_ADDRESS:54321``
- To start H2O: ``./h2o-cluster-start-h2o.sh``
- To stop H2O: ``./h2o-cluster-stop-h2o.sh``
- - To shut down the cluster, use your `Amazon AWS console `_ to shut down the cluster manually.
+ - To shut down the cluster, use your `Amazon AWS console `_ to shut down the cluster manually.
- **Note**: To successfully import data, the data must reside in the same location on all nodes.
+ **Note**: To successfully import data, the data must reside in the same location on all nodes.
.. _minio:
@@ -1033,7 +1031,7 @@ The following is an example core-site.xml file:
fs.s3n.awsSecretAccessKey
insert secret key here
-
+
Launching H2O
@@ -1076,7 +1074,7 @@ For Windows users who do not have the ability to use ``ssh`` from the terminal,
Otherwise, download PuTTY and follow these instructions:
1. Launch the PuTTY Key Generator.
-2. Load your downloaded AWS pem key file.
+2. Load your downloaded AWS pem key file.
**Note:** To see the file, change the browser file type to "All".
@@ -1120,11 +1118,11 @@ Downloading Java and H2O
Using H2O with Microsoft Azure - BETA
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Microsoft Azure provides an important collection of cloud services, such as serverless computing, virtual machines, storage options, networking, and much more. Azure provides the tools for a user to create a Data Science environment with H2O.
+Microsoft Azure provides an important collection of cloud services, such as serverless computing, virtual machines, storage options, networking, and much more. Azure provides the tools for a user to create a Data Science environment with H2O.
This section describes the H2O Application for HDInsight on Microsoft Azure.
-**Note**: This feature is currently in Beta and should be used for testing purposes only.
+**Note**: This feature is currently in Beta and should be used for testing purposes only.
H2O Artificial Intelligence for Azure HDInsight
'''''''''''''''''''''''''''''''''''''''''''''''
@@ -1133,37 +1131,37 @@ The H2O Artificial Intelligence for Azure HDInsight is an application you can in
**Create the H2O AI for Azure HDInsight**
-Follow the steps below to create a new H2O Artificial Intelligence for Azure HDInsight.
+Follow the steps below to create a new H2O Artificial Intelligence for Azure HDInsight.
1. In your Azure portal at `https://portal.azure.com `__, search for H2O, and select **H2O Artificial Intelligence for HDInsight**.
-2. Click the **Create** button, and follow the UI instructions.
+2. Click the **Create** button, and follow the UI instructions.
- **Note**: H2O for HDInsight is exclusively for Spark HDI clusters version 3.5 (HDI v3.5).
+ **Note**: H2O for HDInsight is exclusively for Spark HDI clusters version 3.5 (HDI v3.5).
.. figure:: images/azure_select_h2o_hdinsight.png
:alt: Select H2O Artificial Intelligence for HDInsight
3. In the next screen, under **Basics**, change the Cluster Type to Spark 2.0.2. Sparkling Water is currently configured to work only on Spark 2.0 and above.
-4. On the **Applications** tab, select and accept the Terms of Use for H2O.
+4. On the **Applications** tab, select and accept the Terms of Use for H2O.
.. figure:: images/azure_terms_of_use.png
:alt: Terms of Use for H2O
-5. On the **Credentials** tab, specify the following:
+5. On the **Credentials** tab, specify the following:
- Cluster Login username and password. These are used to connect to your cluster.
   - SSH Username and password. These are used to connect directly to the VM present in the cluster.
-6. On the **Data Source** tab, you can configure either a Blob Storage Account or a Data Lake Store. This is where your HDFS system will be located.
+6. On the **Data Source** tab, you can configure either a Blob Storage Account or a Data Lake Store. This is where your HDFS system will be located.
-7. On the **Cluster Size** tab, select the number of workers nodes you want on your HDI Cluster. Note that you can resize your cluster any time after creation.
+7. On the **Cluster Size** tab, select the number of worker nodes you want on your HDI Cluster. Note that you can resize your cluster any time after creation.
-8. Click **Create** to begin the cluster creation. Note that the cluster creation process can take up to 30 minutes.
+8. Click **Create** to begin the cluster creation. Note that the cluster creation process can take up to 30 minutes.
9. Connect to your Jupyter Notebooks through
- **https://.azurehdinsight.net/jupyter**, and log in using the Cluster Login username and password that you previously created.
+ **https://.azurehdinsight.net/jupyter**, and log in using the Cluster Login username and password that you previously created.
10. In Jupyter, you will see 3 folders: H2O-PySparkling-Examples, PySpark Examples, and Scala Examples. Select H2O-PySparkling-Examples.
@@ -1191,9 +1189,9 @@ Troubleshooting Tips
Using H2O with IBM Data Science Experience - BETA
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The IBM Data Science Experience (DSX) provides an interactive, collaborative, cloud-based environment where data scientists can use multiple tools to activate their insights. With DSX, Data scientists can use the best of open source tools such as R and Python, tap into IBMs unique features, grow their capabilities, and share their successes.
+The IBM Data Science Experience (DSX) provides an interactive, collaborative, cloud-based environment where data scientists can use multiple tools to activate their insights. With DSX, data scientists can use the best of open source tools such as R and Python, tap into IBM's unique features, grow their capabilities, and share their successes.
-This section show how simple it is to use H2O R with IBM DSX.
+This section shows how simple it is to use H2O R with IBM DSX.
1. Sign in to `datascience.ibm.com `__. (Or select **Sign Up** if you do not yet have an account.)
@@ -1207,7 +1205,7 @@ This section show how simple it is to use H2O R with IBM DSX.
3. Install and start H2O R using the instructions included on the `H2O Download site `__. Note that this page opens by default to the **Download and Run** tab. Be sure to select the **Install in R** tab for R installation instructions. |install|
- You can also view a quick start video of installing and starting H2O in R by clicking `here `__.
+ You can also view a quick start video of installing and starting H2O in R by clicking `here `__.
.. |install| image:: images/ibm_install_in_r.png
:height: 24
diff --git a/h2o-hadoop/h2o-mapr3.1-assembly/build.gradle b/h2o-hadoop/h2o-mapr3.1-assembly/build.gradle
deleted file mode 100644
index ea98525ab2d..00000000000
--- a/h2o-hadoop/h2o-mapr3.1-assembly/build.gradle
+++ /dev/null
@@ -1,9 +0,0 @@
-ext {
- notYarn = true
- hadoopVersion = 'mapr3.1'
- hadoopMavenArtifactVersion = '1.0.3-mapr-3.1.1'
- maprExtraDependency = 'org.json:org.json:chargebee-1.0'
- orcSupported = false
-}
-
-apply from: '../assemblyjar.gradle'
\ No newline at end of file
diff --git a/h2o-hadoop/h2o-mapr3.1/build.gradle b/h2o-hadoop/h2o-mapr3.1/build.gradle
deleted file mode 100644
index 1f02514ce71..00000000000
--- a/h2o-hadoop/h2o-mapr3.1/build.gradle
+++ /dev/null
@@ -1,7 +0,0 @@
-ext {
- notYarn = true
- hadoopVersion = 'mapr3.1'
- hadoopMavenArtifactVersion = '1.0.3-mapr-3.1.1'
-}
-
-apply from: '../driverjar.gradle'
diff --git a/h2o-hadoop/h2o-mapreduce-generic/src/main/java/water/hadoop/h2odriver.java b/h2o-hadoop/h2o-mapreduce-generic/src/main/java/water/hadoop/h2odriver.java
index 56f1f8c8c7a..9bce1712313 100644
--- a/h2o-hadoop/h2o-mapreduce-generic/src/main/java/water/hadoop/h2odriver.java
+++ b/h2o-hadoop/h2o-mapreduce-generic/src/main/java/water/hadoop/h2odriver.java
@@ -8,6 +8,7 @@
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import water.H2O;
@@ -101,6 +102,9 @@
static String sessionTimeout = null;
static String userName = System.getProperty("user.name");
static boolean client = false;
+ static String runAsUser = null;
+ static String principal = null;
+ static String keytabPath = null;
// Runtime state that might be touched by different threads.
volatile ServerSocket driverCallbackSocket = null;
@@ -545,6 +549,7 @@ static void usage() {
" [-h | -help]\n" +
" [-jobname ]\n" +
" (Note nnnnn is chosen randomly to produce a unique name)\n" +
+ " [-principal -keytab | -run_as_user ]\n" +
" [-driverif driver callback interface>]\n" +
" [-driverport driver callback interface>]\n" +
" [-driverportrange driver callback interface>; eg: 50000-55000]\n" +
@@ -913,8 +918,16 @@ else if (s.equals("-user_name")) {
else if (s.equals("-client")) {
client = true;
driverArgs = false;
- }
- else {
+ } else if (s.equals("-run_as_user")) {
+ i++; if (i >= args.length) { usage(); }
+ runAsUser = args[i];
+ } else if (s.equals("-principal")) {
+ i++; if (i >= args.length) { usage(); }
+ principal = args[i];
+ } else if (s.equals("-keytab")) {
    i++; if (i >= args.length) { usage(); }
+ keytabPath = args[i];
+ } else {
error("Unrecognized option " + s);
}
@@ -1020,6 +1033,18 @@ void validateArgs() {
}
}
+ if (principal != null || keytabPath != null) {
+ if (principal == null) {
+ error("keytab requires a valid principal (use the '-principal' option)");
+ }
+ if (keytabPath == null) {
+ error("principal requires a valid keytab path (use the '-keytab' option)");
+ }
+ if (runAsUser != null) {
      error("cannot use '-keytab' or '-principal' with '-run_as_user'");
+ }
+ }
+
if (client && disown) {
error("client mode doesn't support the '-disown' option");
}
@@ -1272,6 +1297,16 @@ private int run2(String[] args) throws Exception {
// ---------------------
Configuration conf = getConf();
+ // Run impersonation options
+ if (principal != null && keytabPath != null) {
+ UserGroupInformation.setConfiguration(conf);
+ UserGroupInformation.loginUserFromKeytab(principal, keytabPath);
+ } else if (runAsUser != null) {
+ UserGroupInformation.setConfiguration(conf);
+ UserGroupInformation.setLoginUser(UserGroupInformation.createRemoteUser(runAsUser));
+ }
+
+
// Set memory parameters.
long processTotalPhysicalMemoryMegabytes;
{
diff --git a/make-dist.sh b/make-dist.sh
index 6fcba048417..af7e30dbde6 100755
--- a/make-dist.sh
+++ b/make-dist.sh
@@ -10,7 +10,7 @@ set -x
# Set common variables.
TOPDIR=$(cd `dirname $0` && pwd)
-HADOOP_VERSIONS="cdh5.2 cdh5.3 cdh5.4 cdh5.5 cdh5.6 cdh5.7 cdh5.8 hdp2.1 hdp2.2 hdp2.3 hdp2.4 hdp2.5 hdp2.6 mapr3.1 mapr4.0 mapr5.0 mapr5.1 iop4.2"
+HADOOP_VERSIONS="cdh5.2 cdh5.3 cdh5.4 cdh5.5 cdh5.6 cdh5.7 cdh5.8 hdp2.1 hdp2.2 hdp2.3 hdp2.4 hdp2.5 hdp2.6 mapr4.0 mapr5.0 mapr5.1 iop4.2"
function make_zip_common {
PROJECT_BASE=$1
diff --git a/py2/testdir_single_jvm/test_parse_covtype_2_maprfs.py b/py2/testdir_single_jvm/test_parse_covtype_2_maprfs.py
index 00c497da014..943e0fecf4d 100644
--- a/py2/testdir_single_jvm/test_parse_covtype_2_maprfs.py
+++ b/py2/testdir_single_jvm/test_parse_covtype_2_maprfs.py
@@ -36,13 +36,13 @@ def setUpClass(cls):
h2o.init(1,
enable_benchmark_log=True,
use_maprfs=True,
- hdfs_version='mapr3.1.1',
+ hdfs_version='mapr4.0',
hdfs_name_node='mr-0x2:7222')
# mayb these aren't set correctly with -uc and above,. Let's just set them here
# the import below will use them to form the uri
h2o.nodes[0].use_maprfs = True
h2o.nodes[0].use_hdfs = False
- h2o.nodes[0].hdfs_version = 'mapr3.1.1',
+ h2o.nodes[0].hdfs_version = 'mapr4.0',
h2o.nodes[0].hdfs_name_node = 'mr-0x2:7222'
@@ -124,7 +124,7 @@ def test_parse_covtype_2_maprfs(self):
a_node = h2o.nodes[0]
frames_result = a_node.frames(key=k, row_count=5)
# print "frames_result from the first parseResult key", dump_json(frames_result)
-
+
# FIX! switch this to look at the summary result
parseKeyIndexedCheck(frames_result, multiplyExpected)
# don't want to spill keys
diff --git a/settings.gradle b/settings.gradle
index d8b4782e720..086567a03ec 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -22,7 +22,7 @@ include 'h2o-parquet-parser'
include 'h2o-jaas-pam'
// GRPC support
-if ("true".equals(System.getenv("H2O_BUILD_GRPC"))) {
+if ("true".equals(System.getenv("H2O_BUILD_GRPC"))) {
include 'h2o-assemblies:py2o'
include 'h2o-grpc'
}
@@ -54,9 +54,9 @@ if (System.getProperty("user.name").equals("jenkins")
// Default hadoop build targets
def allTargets = [
- "cdh5.2", "cdh5.3", "cdh5.4", "cdh5.5", "cdh5.6", "cdh5.7", "cdh5.8",
- "hdp2.1", "hdp2.2", "hdp2.3", "hdp2.4", "hdp2.5", "hdp2.6",
- "mapr3.1", "mapr4.0", "mapr5.0", "mapr5.1","iop4.2"
+ "cdh5.2", "cdh5.3", "cdh5.4", "cdh5.5", "cdh5.6", "cdh5.7", "cdh5.8",
+ "hdp2.1", "hdp2.2", "hdp2.3", "hdp2.4", "hdp2.5", "hdp2.6",
+ "mapr4.0", "mapr5.0", "mapr5.1","iop4.2"
]
// Compute targets
def targets = System.getenv("H2O_TARGET") != null ? System.getenv("H2O_TARGET").split(",").collect { it.trim() } : allTargets