/**
 * Small Spark ML demo that prints basic statistics for two in-memory data sets:
 * correlation matrices (Pearson and Spearman) over a "features" vector column,
 * and the first result row of a chi-square test of "features" against "label".
 */
object BasicStatistics {

  /**
   * Entry point. Builds a local [[SparkSession]], runs both demos and always
   * stops the session afterwards.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("BasicStatistics")
      .master("local")
      .getOrCreate()

    // Stop the session even if one of the demos throws, so the local Spark
    // context and its threads are released instead of leaking.
    try {
      spark.sparkContext.setLogLevel("Error")
      printCorrelations(spark)
      printChiSquareTest(spark)
    } finally {
      spark.stop()
    }
  }

  /** Prints the Pearson and Spearman correlation matrices of four sample vectors. */
  private def printCorrelations(spark: SparkSession): Unit = {
    import spark.implicits._

    val data: Seq[linalg.Vector] = Seq(
      Vectors.sparse(4, Seq((0, 1.0), (3, -2.0))),
      Vectors.dense(4.0, 5.0, 0.0, 3.0),
      Vectors.dense(6.0, 7.0, 0.0, 8.0),
      Vectors.sparse(4, Seq((0, 9.0), (3, 1.0)))
    )
    // Wrap each vector in a Tuple1 so it can be converted to a single-column DataFrame.
    val df: DataFrame = data.map(Tuple1.apply).toDF("features")

    // No method name is passed here; the result is reported as the Pearson matrix.
    val Row(coeff1: Matrix) = Correlation.corr(df, "features").head
    println(s"Pearson correlation matrix:\n $coeff1, ${coeff1.getClass}")

    val Row(coeff2: Matrix) = Correlation.corr(df, "features", "spearman").head
    println(s"Spearman correlation matrix:\n $coeff2")
  }

  /** Prints p-values, degrees of freedom and statistics of a chi-square test on sample data. */
  private def printChiSquareTest(spark: SparkSession): Unit = {
    import spark.implicits._

    val data2: Seq[(Double, linalg.Vector)] = Seq(
      (0.0, Vectors.dense(0.5, 10.0)),
      (0.0, Vectors.dense(1.5, 20.0)),
      (1.0, Vectors.dense(1.5, 30.0)),
      (0.0, Vectors.dense(3.5, 30.0)),
      (0.0, Vectors.dense(3.5, 40.0)),
      (1.0, Vectors.dense(3.5, 40.0))
    )
    val df2: DataFrame = data2.toDF("label", "features")
    val chi: Row = ChiSquareTest.test(df2, "features", "label").head

    // Columns are read by position: 0 = p-values, 1 = degrees of freedom, 2 = statistics.
    // `linalg.Vector` is spelled out (as elsewhere in this file) so the type cannot be
    // confused with Scala's own collection `Vector`.
    val degreesOfFreedom: String = chi.getSeq[Int](1).mkString("[", ",", "]")
    println(s"pValues = ${chi.getAs[linalg.Vector](0)}")
    println(s"degreesOfFreedom $degreesOfFreedom")
    println(s"statistics ${chi.getAs[linalg.Vector](2)}")
  }
}
|