Spark 2.X Resilient Distributed Datasets (RDD)

(screenshots)

Run it and check the result:

(screenshot)

 

 

Reference code:

package com.spark.test

import org.apache.spark.sql.SparkSession

object Test {

  def main(args: Array[String]): Unit = {
    // Build a local SparkSession with two worker threads.
    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("HdfsTest")
      .getOrCreate()

    // Input path: take it from the command line, or fall back to a local test file.
    // val filePart = args(0)
    val filePart = "E://Mycode/datas/stu.txt"

    // RDD version of the word count, kept here for comparison:
    // val rdd = spark.sparkContext.textFile(filePart)
    // val lines = rdd.flatMap(x => x.split(" ")).map(x => (x, 1)).reduceByKey((a, b) => a + b).collect().toList
    // println(lines)

    // Dataset version: read the file, split each line into words, and count per word.
    import spark.implicits._
    val dataSet = spark.read.textFile(filePart)
      .flatMap(x => x.split(" "))
      .map(x => (x, 1))
      .groupBy("_1")
      .count()
    // .show()

    dataSet.printSchema()
  }
}
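As a small follow-up (not part of the original code): printSchema only prints the column layout, so to actually see the word counts you could add something like the line below inside main, after dataSet is defined. orderBy and show are standard Dataset operations, and the count column comes from the groupBy(...).count() above.

// Hedged sketch: print the ten most frequent words instead of only the schema.
dataSet.orderBy($"count".desc).show(10)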

 

 

Run result:

(screenshot)

 

E:\software\jdk1.8\bin\java "-javaagent:E:\software\IDEA\IntelliJ IDEA 2017.2.6\lib\idea_rt.jar=64316:E:\software\IDEA\IntelliJ IDEA 2017.2.6\bin" -Dfile.encoding=UTF-8 -classpath E:\software\jdk1.8\jre\lib\charsets.jar;E:\software\jdk1.8\jre\lib\deploy.jar;E:\software\jdk1.8\jre\lib\ext\access-bridge-64.jar;E:\software\jdk1.8\jre\lib\ext\cldrdata.jar;E:\software\jdk1.8\jre\lib\ext\dnsns.jar;E:\software\jdk1.8\jre\lib\ext\jaccess.jar;E:\software\jdk1.8\jre\lib\ext\jfxrt.jar;E:\software\jdk1.8\jre\lib\ext\localedata.jar;E:\software\jdk1.8\jre\lib\ext\nashorn.jar;E:\software\jdk1.8\jre\lib\ext\sunec.jar;E:\software\jdk1.8\jre\lib\ext\sunjce_provider.jar;E:\software\jdk1.8\jre\lib\ext\sunmscapi.jar;E:\software\jdk1.8\jre\lib\ext\sunpkcs11.jar;E:\software\jdk1.8\jre\lib\ext\zipfs.jar;E:\software\jdk1.8\jre\lib\javaws.jar;E:\software\jdk1.8\jre\lib\jce.jar;E:\software\jdk1.8\jre\lib\jfr.jar;E:\software\jdk1.8\jre\lib\jfxswt.jar;E:\software\jdk1.8\jre\lib\jsse.jar;E:\software\jdk1.8\jre\lib\management-agent.jar;E:\software\jdk1.8\jre\lib\plugin.jar;E:\software\jdk1.8\jre\lib\resources.jar;E:\software\jdk1.8\jre\lib\rt.jar;E:\Mycode\SparkStu\target\classes;E:\software\Scala\lib\scala-actors-2.11.0.jar;E:\software\Scala\lib\scala-actors-migration_2.11-1.1.0.jar;E:\software\Scala\lib\scala-library.jar;E:\software\Scala\lib\scala-parser-combinators_2.11-1.0.4.jar;E:\software\Scala\lib\scala-reflect.jar;E:\software\Scala\lib\scala-swing_2.11-1.0.2.jar;E:\software\Scala\lib\scala-xml_2.11-1.0.4.jar;E:\software\maven3.3.9\repository\org\apache\spark\spark-core_2.11\2.2.0\spark-core_2.11-2.2.0.jar;E:\software\maven3.3.9\repository\org\apache\avro\avro\1.7.7\avro-1.7.7.jar;E:\software\maven3.3.9\repository\org\codehaus\jackson\jackson-core-asl\1.9.13\jackson-core-asl-1.9.13.jar;E:\software\maven3.3.9\repository\com\thoughtworks\paranamer\paranamer\2.3\paranamer-2.3.jar;E:\software\maven3.3.9\repository\org\apache\commons\commons-compress\1.4.1\commons-compress-1.4.1.jar;E:\software\maven3.3.9\repository\org\tukaani\xz\1.0\xz-1.0.jar;E:\software\maven3.3.9\repository\org\apache\avro\avro-mapred\1.7.7\avro-mapred-1.7.7-hadoop2.jar;E:\software\maven3.3.9\repository\org\apache\avro\avro-ipc\1.7.7\avro-ipc-1.7.7.jar;E:\software\maven3.3.9\repository\org\apache\avro\avro-ipc\1.7.7\avro-ipc-1.7.7-tests.jar;E:\software\maven3.3.9\repository\com\twitter\chill_2.11\0.8.0\chill_2.11-0.8.0.jar;E:\software\maven3.3.9\repository\com\esotericsoftware\kryo-shaded\3.0.3\kryo-shaded-3.0.3.jar;E:\software\maven3.3.9\repository\com\esotericsoftware\minlog\1.3.0\minlog-1.3.0.jar;E:\software\maven3.3.9\repository\org\objenesis\objenesis\2.1\objenesis-2.1.jar;E:\software\maven3.3.9\repository\com\twitter\chill-java\0.8.0\chill-java-0.8.0.jar;E:\software\maven3.3.9\repository\org\apache\xbean\xbean-asm5-shaded\4.4\xbean-asm5-shaded-4.4.jar;E:\software\maven3.3.9\repository\org\apache\spark\spark-launcher_2.11\2.2.0\spark-launcher_2.11-2.2.0.jar;E:\software\maven3.3.9\repository\org\apache\spark\spark-network-common_2.11\2.2.0\spark-network-common_2.11-2.2.0.jar;E:\software\maven3.3.9\repository\org\fusesource\leveldbjni\leveldbjni-all\1.8\leveldbjni-all-1.8.jar;E:\software\maven3.3.9\repository\com\fasterxml\jackson\core\jackson-annotations\2.6.5\jackson-annotations-2.6.5.jar;E:\software\maven3.3.9\repository\org\apache\spark\spark-network-shuffle_2.11\2.2.0\spark-network-shuffle_2.11-2.2.0.jar;E:\software\maven3.3.9\repository\org\apache\spark\spark-unsafe_2.11\2.2.0\spark-unsafe_2.11-2.2.0.jar;E:\software\maven3.3.9\repository
\net\java\dev\jets3t\jets3t\0.9.3\jets3t-0.9.3.jar;E:\software\maven3.3.9\repository\org\apache\httpcomponents\httpcore\4.3.3\httpcore-4.3.3.jar;E:\software\maven3.3.9\repository\javax\activation\activation\1.1.1\activation-1.1.1.jar;E:\software\maven3.3.9\repository\mx4j\mx4j\3.0.2\mx4j-3.0.2.jar;E:\software\maven3.3.9\repository\javax\mail\mail\1.4.7\mail-1.4.7.jar;E:\software\maven3.3.9\repository\org\bouncycastle\bcprov-jdk15on\1.51\bcprov-jdk15on-1.51.jar;E:\software\maven3.3.9\repository\com\jamesmurty\utils\java-xmlbuilder\1.0\java-xmlbuilder-1.0.jar;E:\software\maven3.3.9\repository\net\iharder\base64\2.3.8\base64-2.3.8.jar;E:\software\maven3.3.9\repository\org\apache\curator\curator-recipes\2.6.0\curator-recipes-2.6.0.jar;E:\software\maven3.3.9\repository\org\apache\curator\curator-framework\2.6.0\curator-framework-2.6.0.jar;E:\software\maven3.3.9\repository\org\apache\zookeeper\zookeeper\3.4.6\zookeeper-3.4.6.jar;E:\software\maven3.3.9\repository\com\google\guava\guava\16.0.1\guava-16.0.1.jar;E:\software\maven3.3.9\repository\javax\servlet\javax.servlet-api\3.1.0\javax.servlet-api-3.1.0.jar;E:\software\maven3.3.9\repository\org\apache\commons\commons-lang3\3.5\commons-lang3-3.5.jar;E:\software\maven3.3.9\repository\org\apache\commons\commons-math3\3.4.1\commons-math3-3.4.1.jar;E:\software\maven3.3.9\repository\com\google\code\findbugs\jsr305\1.3.9\jsr305-1.3.9.jar;E:\software\maven3.3.9\repository\org\slf4j\slf4j-api\1.7.16\slf4j-api-1.7.16.jar;E:\software\maven3.3.9\repository\org\slf4j\jul-to-slf4j\1.7.16\jul-to-slf4j-1.7.16.jar;E:\software\maven3.3.9\repository\org\slf4j\jcl-over-slf4j\1.7.16\jcl-over-slf4j-1.7.16.jar;E:\software\maven3.3.9\repository\log4j\log4j\1.2.17\log4j-1.2.17.jar;E:\software\maven3.3.9\repository\org\slf4j\slf4j-log4j12\1.7.16\slf4j-log4j12-1.7.16.jar;E:\software\maven3.3.9\repository\com\ning\compress-lzf\1.0.3\compress-lzf-1.0.3.jar;E:\software\maven3.3.9\repository\org\xerial\snappy\snappy-java\1.1.2.6\snappy-java-1.1.2.6.jar;E:\software\maven3.3.9\repository\net\jpountz\lz4\lz4\1.3.0\lz4-1.3.0.jar;E:\software\maven3.3.9\repository\org\roaringbitmap\RoaringBitmap\0.5.11\RoaringBitmap-0.5.11.jar;E:\software\maven3.3.9\repository\commons-net\commons-net\2.2\commons-net-2.2.jar;E:\software\maven3.3.9\repository\org\scala-lang\scala-library\2.11.8\scala-library-2.11.8.jar;E:\software\maven3.3.9\repository\org\json4s\json4s-jackson_2.11\3.2.11\json4s-jackson_2.11-3.2.11.jar;E:\software\maven3.3.9\repository\org\json4s\json4s-core_2.11\3.2.11\json4s-core_2.11-3.2.11.jar;E:\software\maven3.3.9\repository\org\json4s\json4s-ast_2.11\3.2.11\json4s-ast_2.11-3.2.11.jar;E:\software\maven3.3.9\repository\org\scala-lang\scalap\2.11.0\scalap-2.11.0.jar;E:\software\maven3.3.9\repository\org\scala-lang\scala-compiler\2.11.0\scala-compiler-2.11.0.jar;E:\software\maven3.3.9\repository\org\scala-lang\modules\scala-xml_2.11\1.0.1\scala-xml_2.11-1.0.1.jar;E:\software\maven3.3.9\repository\org\glassfish\jersey\core\jersey-client\2.22.2\jersey-client-2.22.2.jar;E:\software\maven3.3.9\repository\javax\ws\rs\javax.ws.rs-api\2.0.1\javax.ws.rs-api-2.0.1.jar;E:\software\maven3.3.9\repository\org\glassfish\hk2\hk2-api\2.4.0-b34\hk2-api-2.4.0-b34.jar;E:\software\maven3.3.9\repository\org\glassfish\hk2\hk2-utils\2.4.0-b34\hk2-utils-2.4.0-b34.jar;E:\software\maven3.3.9\repository\org\glassfish\hk2\external\aopalliance-repackaged\2.4.0-b34\aopalliance-repackaged-2.4.0-b34.jar;E:\software\maven3.3.9\repository\org\glassfish\hk2\external\javax.inject\2.4.0-b34\javax.inject-2.4.0-b34.jar;
E:\software\maven3.3.9\repository\org\glassfish\hk2\hk2-locator\2.4.0-b34\hk2-locator-2.4.0-b34.jar;E:\software\maven3.3.9\repository\org\javassist\javassist\3.18.1-GA\javassist-3.18.1-GA.jar;E:\software\maven3.3.9\repository\org\glassfish\jersey\core\jersey-common\2.22.2\jersey-common-2.22.2.jar;E:\software\maven3.3.9\repository\javax\annotation\javax.annotation-api\1.2\javax.annotation-api-1.2.jar;E:\software\maven3.3.9\repository\org\glassfish\jersey\bundles\repackaged\jersey-guava\2.22.2\jersey-guava-2.22.2.jar;E:\software\maven3.3.9\repository\org\glassfish\hk2\osgi-resource-locator\1.0.1\osgi-resource-locator-1.0.1.jar;E:\software\maven3.3.9\repository\org\glassfish\jersey\core\jersey-server\2.22.2\jersey-server-2.22.2.jar;E:\software\maven3.3.9\repository\org\glassfish\jersey\media\jersey-media-jaxb\2.22.2\jersey-media-jaxb-2.22.2.jar;E:\software\maven3.3.9\repository\javax\validation\validation-api\1.1.0.Final\validation-api-1.1.0.Final.jar;E:\software\maven3.3.9\repository\org\glassfish\jersey\containers\jersey-container-servlet\2.22.2\jersey-container-servlet-2.22.2.jar;E:\software\maven3.3.9\repository\org\glassfish\jersey\containers\jersey-container-servlet-core\2.22.2\jersey-container-servlet-core-2.22.2.jar;E:\software\maven3.3.9\repository\io\netty\netty-all\4.0.43.Final\netty-all-4.0.43.Final.jar;E:\software\maven3.3.9\repository\io\netty\netty\3.9.9.Final\netty-3.9.9.Final.jar;E:\software\maven3.3.9\repository\com\clearspring\analytics\stream\2.7.0\stream-2.7.0.jar;E:\software\maven3.3.9\repository\io\dropwizard\metrics\metrics-core\3.1.2\metrics-core-3.1.2.jar;E:\software\maven3.3.9\repository\io\dropwizard\metrics\metrics-jvm\3.1.2\metrics-jvm-3.1.2.jar;E:\software\maven3.3.9\repository\io\dropwizard\metrics\metrics-json\3.1.2\metrics-json-3.1.2.jar;E:\software\maven3.3.9\repository\io\dropwizard\metrics\metrics-graphite\3.1.2\metrics-graphite-3.1.2.jar;E:\software\maven3.3.9\repository\com\fasterxml\jackson\core\jackson-databind\2.6.5\jackson-databind-2.6.5.jar;E:\software\maven3.3.9\repository\com\fasterxml\jackson\core\jackson-core\2.6.5\jackson-core-2.6.5.jar;E:\software\maven3.3.9\repository\com\fasterxml\jackson\module\jackson-module-scala_2.11\2.6.5\jackson-module-scala_2.11-2.6.5.jar;E:\software\maven3.3.9\repository\org\scala-lang\scala-reflect\2.11.7\scala-reflect-2.11.7.jar;E:\software\maven3.3.9\repository\com\fasterxml\jackson\module\jackson-module-paranamer\2.6.5\jackson-module-paranamer-2.6.5.jar;E:\software\maven3.3.9\repository\org\apache\ivy\ivy\2.4.0\ivy-2.4.0.jar;E:\software\maven3.3.9\repository\oro\oro\2.0.8\oro-2.0.8.jar;E:\software\maven3.3.9\repository\net\razorvine\pyrolite\4.13\pyrolite-4.13.jar;E:\software\maven3.3.9\repository\net\sf\py4j\py4j\0.10.4\py4j-0.10.4.jar;E:\software\maven3.3.9\repository\org\apache\spark\spark-tags_2.11\2.2.0\spark-tags_2.11-2.2.0.jar;E:\software\maven3.3.9\repository\org\apache\commons\commons-crypto\1.0.0\commons-crypto-1.0.0.jar;E:\software\maven3.3.9\repository\org\spark-project\spark\unused\1.0.0\unused-1.0.0.jar;E:\software\maven3.3.9\repository\org\apache\spark\spark-sql_2.11\2.2.0\spark-sql_2.11-2.2.0.jar;E:\software\maven3.3.9\repository\com\univocity\univocity-parsers\2.2.1\univocity-parsers-2.2.1.jar;E:\software\maven3.3.9\repository\org\apache\spark\spark-sketch_2.11\2.2.0\spark-sketch_2.11-2.2.0.jar;E:\software\maven3.3.9\repository\org\apache\spark\spark-catalyst_2.11\2.2.0\spark-catalyst_2.11-2.2.0.jar;E:\software\maven3.3.9\repository\org\codehaus\janino\janino\3.0.0\janino-3.0.0.jar;E:\software\mav
en3.3.9\repository\org\codehaus\janino\commons-compiler\3.0.0\commons-compiler-3.0.0.jar;E:\software\maven3.3.9\repository\org\antlr\antlr4-runtime\4.5.3\antlr4-runtime-4.5.3.jar;E:\software\maven3.3.9\repository\org\apache\parquet\parquet-column\1.8.2\parquet-column-1.8.2.jar;E:\software\maven3.3.9\repository\org\apache\parquet\parquet-common\1.8.2\parquet-common-1.8.2.jar;E:\software\maven3.3.9\repository\org\apache\parquet\parquet-encoding\1.8.2\parquet-encoding-1.8.2.jar;E:\software\maven3.3.9\repository\org\apache\parquet\parquet-hadoop\1.8.2\parquet-hadoop-1.8.2.jar;E:\software\maven3.3.9\repository\org\apache\parquet\parquet-format\2.3.1\parquet-format-2.3.1.jar;E:\software\maven3.3.9\repository\org\apache\parquet\parquet-jackson\1.8.2\parquet-jackson-1.8.2.jar;E:\software\maven3.3.9\repository\org\apache\spark\spark-streaming_2.11\2.2.0\spark-streaming_2.11-2.2.0.jar;E:\software\maven3.3.9\repository\org\apache\spark\spark-hive_2.11\2.2.0\spark-hive_2.11-2.2.0.jar;E:\software\maven3.3.9\repository\com\twitter\parquet-hadoop-bundle\1.6.0\parquet-hadoop-bundle-1.6.0.jar;E:\software\maven3.3.9\repository\org\spark-project\hive\hive-exec\1.2.1.spark2\hive-exec-1.2.1.spark2.jar;E:\software\maven3.3.9\repository\commons-io\commons-io\2.4\commons-io-2.4.jar;E:\software\maven3.3.9\repository\commons-lang\commons-lang\2.6\commons-lang-2.6.jar;E:\software\maven3.3.9\repository\javolution\javolution\5.5.1\javolution-5.5.1.jar;E:\software\maven3.3.9\repository\log4j\apache-log4j-extras\1.2.17\apache-log4j-extras-1.2.17.jar;E:\software\maven3.3.9\repository\org\antlr\antlr-runtime\3.4\antlr-runtime-3.4.jar;E:\software\maven3.3.9\repository\org\antlr\stringtemplate\3.2.1\stringtemplate-3.2.1.jar;E:\software\maven3.3.9\repository\antlr\antlr\2.7.7\antlr-2.7.7.jar;E:\software\maven3.3.9\repository\org\antlr\ST4\4.0.4\ST4-4.0.4.jar;E:\software\maven3.3.9\repository\com\googlecode\javaewah\JavaEWAH\0.3.2\JavaEWAH-0.3.2.jar;E:\software\maven3.3.9\repository\org\iq80\snappy\snappy\0.2\snappy-0.2.jar;E:\software\maven3.3.9\repository\stax\stax-api\1.0.1\stax-api-1.0.1.jar;E:\software\maven3.3.9\repository\net\sf\opencsv\opencsv\2.3\opencsv-2.3.jar;E:\software\maven3.3.9\repository\org\spark-project\hive\hive-metastore\1.2.1.spark2\hive-metastore-1.2.1.spark2.jar;E:\software\maven3.3.9\repository\com\jolbox\bonecp\0.8.0.RELEASE\bonecp-0.8.0.RELEASE.jar;E:\software\maven3.3.9\repository\commons-cli\commons-cli\1.2\commons-cli-1.2.jar;E:\software\maven3.3.9\repository\commons-logging\commons-logging\1.1.3\commons-logging-1.1.3.jar;E:\software\maven3.3.9\repository\org\apache\derby\derby\10.10.2.0\derby-10.10.2.0.jar;E:\software\maven3.3.9\repository\org\datanucleus\datanucleus-api-jdo\3.2.6\datanucleus-api-jdo-3.2.6.jar;E:\software\maven3.3.9\repository\org\datanucleus\datanucleus-rdbms\3.2.9\datanucleus-rdbms-3.2.9.jar;E:\software\maven3.3.9\repository\commons-pool\commons-pool\1.5.4\commons-pool-1.5.4.jar;E:\software\maven3.3.9\repository\commons-dbcp\commons-dbcp\1.4\commons-dbcp-1.4.jar;E:\software\maven3.3.9\repository\javax\jdo\jdo-api\3.0.1\jdo-api-3.0.1.jar;E:\software\maven3.3.9\repository\javax\transaction\jta\1.1\jta-1.1.jar;E:\software\maven3.3.9\repository\commons-httpclient\commons-httpclient\3.1\commons-httpclient-3.1.jar;E:\software\maven3.3.9\repository\org\apache\calcite\calcite-avatica\1.2.0-incubating\calcite-avatica-1.2.0-incubating.jar;E:\software\maven3.3.9\repository\org\apache\calcite\calcite-core\1.2.0-incubating\calcite-core-1.2.0-incubating.jar;E:\software\maven3.3.9\repository\
org\apache\calcite\calcite-linq4j\1.2.0-incubating\calcite-linq4j-1.2.0-incubating.jar;E:\software\maven3.3.9\repository\net\hydromatic\eigenbase-properties\1.1.5\eigenbase-properties-1.1.5.jar;E:\software\maven3.3.9\repository\org\apache\httpcomponents\httpclient\4.5.2\httpclient-4.5.2.jar;E:\software\maven3.3.9\repository\org\codehaus\jackson\jackson-mapper-asl\1.9.13\jackson-mapper-asl-1.9.13.jar;E:\software\maven3.3.9\repository\commons-codec\commons-codec\1.10\commons-codec-1.10.jar;E:\software\maven3.3.9\repository\joda-time\joda-time\2.9.3\joda-time-2.9.3.jar;E:\software\maven3.3.9\repository\org\jodd\jodd-core\3.5.2\jodd-core-3.5.2.jar;E:\software\maven3.3.9\repository\org\datanucleus\datanucleus-core\3.2.10\datanucleus-core-3.2.10.jar;E:\software\maven3.3.9\repository\org\apache\thrift\libthrift\0.9.3\libthrift-0.9.3.jar;E:\software\maven3.3.9\repository\org\apache\thrift\libfb303\0.9.3\libfb303-0.9.3.jar;E:\software\maven3.3.9\repository\org\apache\spark\spark-streaming-kafka-0-10_2.11\2.2.0\spark-streaming-kafka-0-10_2.11-2.2.0.jar;E:\software\maven3.3.9\repository\org\apache\kafka\kafka_2.11\0.10.0.1\kafka_2.11-0.10.0.1.jar;E:\software\maven3.3.9\repository\com\101tec\zkclient\0.8\zkclient-0.8.jar;E:\software\maven3.3.9\repository\com\yammer\metrics\metrics-core\2.2.0\metrics-core-2.2.0.jar;E:\software\maven3.3.9\repository\org\scala-lang\modules\scala-parser-combinators_2.11\1.0.4\scala-parser-combinators_2.11-1.0.4.jar;E:\software\maven3.3.9\repository\org\apache\spark\spark-sql-kafka-0-10_2.11\2.2.0\spark-sql-kafka-0-10_2.11-2.2.0.jar;E:\software\maven3.3.9\repository\org\apache\kafka\kafka-clients\0.10.0.1\kafka-clients-0.10.0.1.jar;E:\software\maven3.3.9\repository\org\apache\hadoop\hadoop-client\2.6.0\hadoop-client-2.6.0.jar;E:\software\maven3.3.9\repository\org\apache\hadoop\hadoop-common\2.6.0\hadoop-common-2.6.0.jar;E:\software\maven3.3.9\repository\xmlenc\xmlenc\0.52\xmlenc-0.52.jar;E:\software\maven3.3.9\repository\commons-collections\commons-collections\3.2.1\commons-collections-3.2.1.jar;E:\software\maven3.3.9\repository\commons-configuration\commons-configuration\1.6\commons-configuration-1.6.jar;E:\software\maven3.3.9\repository\commons-digester\commons-digester\1.8\commons-digester-1.8.jar;E:\software\maven3.3.9\repository\commons-beanutils\commons-beanutils\1.7.0\commons-beanutils-1.7.0.jar;E:\software\maven3.3.9\repository\commons-beanutils\commons-beanutils-core\1.8.0\commons-beanutils-core-1.8.0.jar;E:\software\maven3.3.9\repository\com\google\protobuf\protobuf-java\2.5.0\protobuf-java-2.5.0.jar;E:\software\maven3.3.9\repository\com\google\code\gson\gson\2.2.4\gson-2.2.4.jar;E:\software\maven3.3.9\repository\org\apache\hadoop\hadoop-auth\2.6.0\hadoop-auth-2.6.0.jar;E:\software\maven3.3.9\repository\org\apache\directory\server\apacheds-kerberos-codec\2.0.0-M15\apacheds-kerberos-codec-2.0.0-M15.jar;E:\software\maven3.3.9\repository\org\apache\directory\server\apacheds-i18n\2.0.0-M15\apacheds-i18n-2.0.0-M15.jar;E:\software\maven3.3.9\repository\org\apache\directory\api\api-asn1-api\1.0.0-M20\api-asn1-api-1.0.0-M20.jar;E:\software\maven3.3.9\repository\org\apache\directory\api\api-util\1.0.0-M20\api-util-1.0.0-M20.jar;E:\software\maven3.3.9\repository\org\apache\curator\curator-client\2.6.0\curator-client-2.6.0.jar;E:\software\maven3.3.9\repository\org\htrace\htrace-core\3.0.4\htrace-core-3.0.4.jar;E:\software\maven3.3.9\repository\org\apache\hadoop\hadoop-hdfs\2.6.0\hadoop-hdfs-2.6.0.jar;E:\software\maven3.3.9\repository\org\mortbay\jetty\jetty-util\6.1.26\jetty
-util-6.1.26.jar;E:\software\maven3.3.9\repository\xerces\xercesImpl\2.9.1\xercesImpl-2.9.1.jar;E:\software\maven3.3.9\repository\xml-apis\xml-apis\1.3.04\xml-apis-1.3.04.jar;E:\software\maven3.3.9\repository\org\apache\hadoop\hadoop-mapreduce-client-app\2.6.0\hadoop-mapreduce-client-app-2.6.0.jar;E:\software\maven3.3.9\repository\org\apache\hadoop\hadoop-mapreduce-client-common\2.6.0\hadoop-mapreduce-client-common-2.6.0.jar;E:\software\maven3.3.9\repository\org\apache\hadoop\hadoop-yarn-client\2.6.0\hadoop-yarn-client-2.6.0.jar;E:\software\maven3.3.9\repository\org\apache\hadoop\hadoop-yarn-server-common\2.6.0\hadoop-yarn-server-common-2.6.0.jar;E:\software\maven3.3.9\repository\org\apache\hadoop\hadoop-mapreduce-client-shuffle\2.6.0\hadoop-mapreduce-client-shuffle-2.6.0.jar;E:\software\maven3.3.9\repository\org\apache\hadoop\hadoop-yarn-api\2.6.0\hadoop-yarn-api-2.6.0.jar;E:\software\maven3.3.9\repository\org\apache\hadoop\hadoop-mapreduce-client-core\2.6.0\hadoop-mapreduce-client-core-2.6.0.jar;E:\software\maven3.3.9\repository\org\apache\hadoop\hadoop-yarn-common\2.6.0\hadoop-yarn-common-2.6.0.jar;E:\software\maven3.3.9\repository\javax\xml\bind\jaxb-api\2.2.2\jaxb-api-2.2.2.jar;E:\software\maven3.3.9\repository\javax\xml\stream\stax-api\1.0-2\stax-api-1.0-2.jar;E:\software\maven3.3.9\repository\javax\servlet\servlet-api\2.5\servlet-api-2.5.jar;E:\software\maven3.3.9\repository\com\sun\jersey\jersey-core\1.9\jersey-core-1.9.jar;E:\software\maven3.3.9\repository\com\sun\jersey\jersey-client\1.9\jersey-client-1.9.jar;E:\software\maven3.3.9\repository\org\codehaus\jackson\jackson-jaxrs\1.9.13\jackson-jaxrs-1.9.13.jar;E:\software\maven3.3.9\repository\org\codehaus\jackson\jackson-xc\1.9.13\jackson-xc-1.9.13.jar;E:\software\maven3.3.9\repository\org\apache\hadoop\hadoop-mapreduce-client-jobclient\2.6.0\hadoop-mapreduce-client-jobclient-2.6.0.jar;E:\software\maven3.3.9\repository\org\apache\hadoop\hadoop-annotations\2.6.0\hadoop-annotations-2.6.0.jar com.spark.test.Test
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
18/03/16 21:49:16 INFO SparkContext: Running Spark version 2.2.0
18/03/16 21:49:17 INFO SparkContext: Submitted application: HdfsTest
18/03/16 21:49:17 INFO SecurityManager: Changing view acls to: Brave
18/03/16 21:49:17 INFO SecurityManager: Changing modify acls to: Brave
18/03/16 21:49:17 INFO SecurityManager: Changing view acls groups to:
18/03/16 21:49:17 INFO SecurityManager: Changing modify acls groups to:
18/03/16 21:49:17 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(Brave); groups with view permissions: Set(); users with modify permissions: Set(Brave); groups with modify permissions: Set()
18/03/16 21:49:18 INFO Utils: Successfully started service 'sparkDriver' on port 64353.
18/03/16 21:49:18 INFO SparkEnv: Registering MapOutputTracker
18/03/16 21:49:18 INFO SparkEnv: Registering BlockManagerMaster
18/03/16 21:49:18 INFO BlockManagerMasterEndpoint: Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information
18/03/16 21:49:18 INFO BlockManagerMasterEndpoint: BlockManagerMasterEndpoint up
18/03/16 21:49:18 INFO DiskBlockManager: Created local directory at C:\Users\Brave\AppData\Local\Temp\blockmgr-8a40eb8b-fed0-42b1-a1a6-f8f4e58d2f9a
18/03/16 21:49:18 INFO MemoryStore: MemoryStore started with capacity 1998.3 MB
18/03/16 21:49:18 INFO SparkEnv: Registering OutputCommitCoordinator
18/03/16 21:49:19 INFO Utils: Successfully started service 'SparkUI' on port 4040.
18/03/16 21:49:19 INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://192.168.56.1:4040
18/03/16 21:49:19 INFO Executor: Starting executor ID driver on host localhost
18/03/16 21:49:19 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 64370.
18/03/16 21:49:19 INFO NettyBlockTransferService: Server created on 192.168.56.1:64370
18/03/16 21:49:19 INFO BlockManager: Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
18/03/16 21:49:19 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(driver, 192.168.56.1, 64370, None)
18/03/16 21:49:19 INFO BlockManagerMasterEndpoint: Registering block manager 192.168.56.1:64370 with 1998.3 MB RAM, BlockManagerId(driver, 192.168.56.1, 64370, None)
18/03/16 21:49:19 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(driver, 192.168.56.1, 64370, None)
18/03/16 21:49:19 INFO BlockManager: Initialized BlockManager: BlockManagerId(driver, 192.168.56.1, 64370, None)
18/03/16 21:49:19 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir ('file:/E:/Mycode/SparkStu/spark-warehouse/').
18/03/16 21:49:19 INFO SharedState: Warehouse path is 'file:/E:/Mycode/SparkStu/spark-warehouse/'.
18/03/16 21:49:20 INFO StateStoreCoordinatorRef: Registered StateStoreCoordinator endpoint
root
|-- _1: string (nullable = true)
|-- count: long (nullable = false)

18/03/16 21:49:23 INFO SparkContext: Invoking stop() from shutdown hook
18/03/16 21:49:23 INFO SparkUI: Stopped Spark web UI at http://192.168.56.1:4040
18/03/16 21:49:23 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
18/03/16 21:49:23 INFO MemoryStore: MemoryStore cleared
18/03/16 21:49:23 INFO BlockManager: BlockManager stopped
18/03/16 21:49:23 INFO BlockManagerMaster: BlockManagerMaster stopped
18/03/16 21:49:23 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
18/03/16 21:49:23 INFO SparkContext: Successfully stopped SparkContext
18/03/16 21:49:23 INFO ShutdownHookManager: Shutdown hook called
18/03/16 21:49:23 INFO ShutdownHookManager: Deleting directory C:\Users\Brave\AppData\Local\Temp\spark-f360cd01-2835-4aad-a1db-56f167e10d42

Process finished with exit code 0

 

 

Now change this part:

(screenshot)

 

The run result is shown below:

(screenshot)

 

 

 

 

(screenshots)

Open this address:

(screenshot)

 

 

Now let's generate a job so that something shows up on the monitoring page.
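The exact command is in the screenshots that follow; as a minimal illustration (not necessarily the command shown there), any RDD action typed in spark-shell produces a job that appears on the web UI, for example:

// Hedged example: an action such as count() triggers a job visible on the monitoring page.
sc.parallelize(1 to 100).count()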

(screenshots)

From the monitoring page we can see:

(screenshots)

Type some random words to use as test data:

(screenshots)

scala> val rdd1=sc.textFile("file:///opt/datas/stu.txt")
18/03/17 18:16:55 INFO MemoryStore: Block broadcast_3 stored as values in memory (estimated size 229.9 KB, free 413.5 MB)
18/03/17 18:16:55 INFO MemoryStore: Block broadcast_3_piece0 stored as bytes in memory (estimated size 21.4 KB, free 413.4 MB)
18/03/17 18:16:55 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on 192.168.86.151:48324 (size: 21.4 KB, free: 413.9 MB)
18/03/17 18:16:55 INFO SparkContext: Created broadcast 3 from textFile at <console>:25
rdd1: org.apache.spark.rdd.RDD[String] = file:///opt/datas/stu.txt MapPartitionsRDD[5] at textFile at <console>:25

scala> val rdd2=sc.textFile("file:///opt/datas/stu1.txt")
18/03/17 18:17:04 INFO MemoryStore: Block broadcast_4 stored as values in memory (estimated size 229.9 KB, free 413.2 MB)
18/03/17 18:17:05 INFO MemoryStore: Block broadcast_4_piece0 stored as bytes in memory (estimated size 21.4 KB, free 413.2 MB)
18/03/17 18:17:05 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on 192.168.86.151:48324 (size: 21.4 KB, free: 413.9 MB)
18/03/17 18:17:05 INFO SparkContext: Created broadcast 4 from textFile at <console>:25
rdd2: org.apache.spark.rdd.RDD[String] = file:///opt/datas/stu1.txt MapPartitionsRDD[7] at textFile at <console>:25

scala> val rdd=rdd1.union(rdd2)
rdd: org.apache.spark.rdd.RDD[String] = UnionRDD[8] at union at <console>:29

scala>
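A quick aside (not in the original session): union simply concatenates the partitions of the two input RDDs without shuffling, so the result holds rdd1's partitions plus rdd2's. This can be checked with getNumPartitions:

// Hedged check: the union keeps every partition of both inputs.
rdd1.getNumPartitions
rdd2.getNumPartitions
rdd.getNumPartitions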

 

 

 

(screenshots)

 

Split each line into words, pair each word with a 1, and collect:

val lines = rdd.flatMap(x=>x.split(" ")).map(x=>(x,1)).collect

 

 

 

 

(screenshots)

 

Now add reduceByKey to sum the counts for each word:

val lines = rdd.flatMap(x=>x.split(" ")).map(x=>(x,1)).reduceByKey((a,b)=>(a+b)).collect

 

 

 

(screenshots)

 

Filter the result so that only words appearing more than once are kept:

val lines = rdd.flatMap(x=>x.split(" ")).map(x=>(x,1)).reduceByKey((a,b)=>(a+b)).filter(x => (x._2>1)).collect

 

 

(screenshots)

 

 

Swap key and value so the count becomes the key, in preparation for sorting:

val lines = rdd.flatMap(x=>x.split(" ")).map(x=>(x,1)).reduceByKey((a,b)=>(a+b)).filter(x => (x._2>1)).map(x=>(x._2,x._1)).collect

 

 

 

 

(screenshots)

 

Sort by key, that is, by the count; sortByKey sorts in ascending order by default:

val lines = rdd.flatMap(x=>x.split(" ")).map(x=>(x,1)).reduceByKey((a,b)=>(a+b)).filter(x => (x._2>1)).map(x=>(x._2,x._1)).sortByKey().collect

 

 

(screenshots)

 

Pass false to sortByKey to sort in descending order instead:

val lines = rdd.flatMap(x=>x.split(" ")).map(x=>(x,1)).reduceByKey((a,b)=>(a+b)).filter(x => (x._2>1)).map(x=>(x._2,x._1)).sortByKey(false).collect

 

 

 

(screenshots)

Finally, swap key and value back so each element reads as (word, count):

val lines = rdd.flatMap(x=>x.split(" ")).map(x=>(x,1)).reduceByKey((a,b)=>(a+b)).filter(x => (x._2>1)).map(x=>(x._2,x._1)).sortByKey(false).map(x=>(x._2,x._1)).collect
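For readability, the same pipeline can be written out step by step with comments (equivalent to the one-liner above; use :paste in spark-shell to enter a multi-line block):

val lines = rdd
  .flatMap(x => x.split(" "))      // split each line into words
  .map(x => (x, 1))                // pair each word with a count of 1
  .reduceByKey((a, b) => a + b)    // sum the counts per word
  .filter(x => x._2 > 1)           // keep only words that occur more than once
  .map(x => (x._2, x._1))          // swap to (count, word) so we can sort by count
  .sortByKey(false)                // sort descending by count
  .map(x => (x._2, x._1))          // swap back to (word, count)
  .collect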

 

 

 

(screenshots)

 

I use node 2's address here because its HDFS NameNode is currently the active one.

(screenshot)

You can see it throws an error: the output directory already exists. saveAsTextFile requires an output path that does not exist yet.
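A minimal sketch of the save step (the hostname, port, and output path here are illustrative, not taken from the screenshots): build the word-count RDD without collect, then write it to a directory that does not yet exist on HDFS.

// Hedged sketch; adjust the NameNode address and path to your cluster.
val result = rdd.flatMap(x => x.split(" ")).map(x => (x, 1)).reduceByKey((a, b) => a + b)
result.saveAsTextFile("hdfs://node2:8020/output/wordcount")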

 

 

(screenshots)

This time it succeeds.

 

Let's look at the output directory on HDFS:

(screenshot)
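Besides browsing HDFS, the saved result can also be sanity-checked straight from spark-shell (using the same illustrative path as in the sketch above):

// Hedged sketch: read the saved part files back and print a few lines.
sc.textFile("hdfs://node2:8020/output/wordcount").take(5).foreach(println)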

 

 

Now exit spark-shell and start it again.

Then re-run everything from before:

val rdd1=sc.textFile("file:///opt/datas/stu.txt")
val rdd2=sc.textFile("file:///opt/datas/stu1.txt")
val rdd=rdd1.union(rdd2)
val lines = rdd.flatMap(x=>x.split(" ")).map(x=>(x,1)).reduceByKey((a,b)=>(a+b)).filter(x => (x._2>1)).map(x=>(x._2,x._1)).sortByKey(false).map(x=>(x._2,x._1))

 

 

After that finishes, print the result with foreach:

(screenshots)

 

 

scala> lines.foreach(println)
18/03/17 19:51:06 INFO BlockManagerInfo: Removed broadcast_2_piece0 on 192.168.86.151:55570 in memory (size: 3.2 KB, free: 413.9 MB)
18/03/17 19:51:06 INFO BlockManagerInfo: Removed broadcast_3_piece0 on 192.168.86.151:55570 in memory (size: 2.4 KB, free: 413.9 MB)
18/03/17 19:51:06 INFO SparkContext: Starting job: foreach at <console>:33
18/03/17 19:51:06 INFO MapOutputTrackerMaster: Size of output statuses for shuffle 0 is 162 bytes
18/03/17 19:51:06 INFO DAGScheduler: Registering RDD 9 (map at <console>:30)
18/03/17 19:51:06 INFO DAGScheduler: Got job 1 (foreach at <console>:33) with 2 output partitions
18/03/17 19:51:06 INFO DAGScheduler: Final stage: ResultStage 4 (foreach at <console>:33)
18/03/17 19:51:06 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 3)
18/03/17 19:51:06 INFO DAGScheduler: Missing parents: List(ShuffleMapStage 3)
18/03/17 19:51:06 INFO DAGScheduler: Submitting ShuffleMapStage 3 (MapPartitionsRDD[9] at map at <console>:30), which has no missing parents
18/03/17 19:51:07 INFO MemoryStore: Block broadcast_4 stored as values in memory (estimated size 4.3 KB, free 413.4 MB)
18/03/17 19:51:07 INFO MemoryStore: Block broadcast_4_piece0 stored as bytes in memory (estimated size 2.5 KB, free 413.4 MB)
18/03/17 19:51:07 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on 192.168.86.151:55570 (size: 2.5 KB, free: 413.9 MB)
18/03/17 19:51:07 INFO SparkContext: Created broadcast 4 from broadcast at DAGScheduler.scala:1006
18/03/17 19:51:07 INFO DAGScheduler: Submitting 2 missing tasks from ShuffleMapStage 3 (MapPartitionsRDD[9] at map at <console>:30) (first 15 tasks are for partitions Vector(0, 1))
18/03/17 19:51:07 INFO TaskSchedulerImpl: Adding task set 3.0 with 2 tasks
18/03/17 19:51:07 INFO TaskSetManager: Starting task 0.0 in stage 3.0 (TID 4, localhost, executor driver, partition 0, ANY, 4610 bytes)
18/03/17 19:51:07 INFO Executor: Running task 0.0 in stage 3.0 (TID 4)
18/03/17 19:51:07 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/17 19:51:07 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 1 ms
18/03/17 19:51:07 INFO Executor: Finished task 0.0 in stage 3.0 (TID 4). 1285 bytes result sent to driver
18/03/17 19:51:07 INFO TaskSetManager: Starting task 1.0 in stage 3.0 (TID 5, localhost, executor driver, partition 1, ANY, 4610 bytes)
18/03/17 19:51:07 INFO TaskSetManager: Finished task 0.0 in stage 3.0 (TID 4) in 115 ms on localhost (executor driver) (1/2)
18/03/17 19:51:07 INFO Executor: Running task 1.0 in stage 3.0 (TID 5)
18/03/17 19:51:07 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/17 19:51:07 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 10 ms
18/03/17 19:51:07 INFO Executor: Finished task 1.0 in stage 3.0 (TID 5). 1285 bytes result sent to driver
18/03/17 19:51:07 INFO TaskSetManager: Finished task 1.0 in stage 3.0 (TID 5) in 118 ms on localhost (executor driver) (2/2)
18/03/17 19:51:07 INFO TaskSchedulerImpl: Removed TaskSet 3.0, whose tasks have all completed, from pool
18/03/17 19:51:07 INFO DAGScheduler: ShuffleMapStage 3 (map at <console>:30) finished in 0.227 s
18/03/17 19:51:07 INFO DAGScheduler: looking for newly runnable stages
18/03/17 19:51:07 INFO DAGScheduler: running: Set()
18/03/17 19:51:07 INFO DAGScheduler: waiting: Set(ResultStage 4)
18/03/17 19:51:07 INFO DAGScheduler: failed: Set()
18/03/17 19:51:07 INFO DAGScheduler: Submitting ResultStage 4 (MapPartitionsRDD[13] at map at <console>:30), which has no missing parents
18/03/17 19:51:07 INFO MemoryStore: Block broadcast_5 stored as values in memory (estimated size 3.9 KB, free 413.4 MB)
18/03/17 19:51:07 INFO MemoryStore: Block broadcast_5_piece0 stored as bytes in memory (estimated size 2.3 KB, free 413.4 MB)
18/03/17 19:51:07 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on 192.168.86.151:55570 (size: 2.3 KB, free: 413.9 MB)
18/03/17 19:51:07 INFO SparkContext: Created broadcast 5 from broadcast at DAGScheduler.scala:1006
18/03/17 19:51:07 INFO DAGScheduler: Submitting 2 missing tasks from ResultStage 4 (MapPartitionsRDD[13] at map at <console>:30) (first 15 tasks are for partitions Vector(0, 1))
18/03/17 19:51:07 INFO TaskSchedulerImpl: Adding task set 4.0 with 2 tasks
18/03/17 19:51:07 INFO TaskSetManager: Starting task 0.0 in stage 4.0 (TID 6, localhost, executor driver, partition 0, ANY, 4621 bytes)
18/03/17 19:51:07 INFO Executor: Running task 0.0 in stage 4.0 (TID 6)
18/03/17 19:51:07 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/17 19:51:07 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
(mysql,9)
(hive,8)
(hadoop,6)
(she,4)
(hello,4)
(node,3)
18/03/17 19:51:07 INFO Executor: Finished task 0.0 in stage 4.0 (TID 6). 1138 bytes result sent to driver
18/03/17 19:51:07 INFO TaskSetManager: Starting task 1.0 in stage 4.0 (TID 7, localhost, executor driver, partition 1, ANY, 4621 bytes)
18/03/17 19:51:07 INFO TaskSetManager: Finished task 0.0 in stage 4.0 (TID 6) in 97 ms on localhost (executor driver) (1/2)
18/03/17 19:51:07 INFO Executor: Running task 1.0 in stage 4.0 (TID 7)
18/03/17 19:51:07 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/17 19:51:07 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
(eggg,2)
(lele,2)
(bag,2)
(haad,2)
(kjkd,2)
(tom,2)
(who,2)
(pig,2)
(jkji,2)
(word,2)
(kljk,2)
(sqoop,2)
(dag,2)
(me,2)
(kjfklds,2)
(you,2)
(whast,2)
(he,2)
(kjdskf,2)
(jdifj,2)
(jack,2)
(where,2)
(take,2)
(hjsdhj,2)
(kjskldjf,2)
(spatk,2)
(heeef,2)
(spring,2)
(hbase,2)
18/03/17 19:51:07 INFO Executor: Finished task 1.0 in stage 4.0 (TID 7). 1095 bytes result sent to driver
18/03/17 19:51:07 INFO TaskSetManager: Finished task 1.0 in stage 4.0 (TID 7) in 63 ms on localhost (executor driver) (2/2)
18/03/17 19:51:07 INFO DAGScheduler: ResultStage 4 (foreach at <console>:33) finished in 0.155 s
18/03/17 19:51:07 INFO TaskSchedulerImpl: Removed TaskSet 4.0, whose tasks have all completed, from pool
18/03/17 19:51:07 INFO DAGScheduler: Job 1 finished: foreach at <console>:33, took 0.509656 s

scala>

 

 

Notice that the result printed this way is interleaved and rather ugly to read.

(screenshot)

That is because this RDD has two partitions: each partition is printed by its own task, so the output comes out in two separate blocks.

 

Let's set the number of partitions explicitly:
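The exact command is in the screenshots; judging from the single output partition in the log below ("Got job 2 ... with 1 output partitions"), it is presumably a repartition down to one partition, along the lines of:

// Hedged sketch: repartition returns a new RDD; foreach on it then runs as one task,
// so the whole result is printed in a single block.
val res2 = lines.repartition(1)
res2.foreach(println)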

(screenshots)

 

 

scala> res2.foreach(println)
18/03/17 19:58:31 INFO BlockManagerInfo: Removed broadcast_4_piece0 on 192.168.86.151:55570 in memory (size: 2.5 KB, free: 413.9 MB)
18/03/17 19:58:31 INFO BlockManagerInfo: Removed broadcast_5_piece0 on 192.168.86.151:55570 in memory (size: 2.3 KB, free: 413.9 MB)
18/03/17 19:58:31 INFO SparkContext: Starting job: foreach at <console>:35
18/03/17 19:58:31 INFO MapOutputTrackerMaster: Size of output statuses for shuffle 0 is 162 bytes
18/03/17 19:58:31 INFO MapOutputTrackerMaster: Size of output statuses for shuffle 1 is 162 bytes
18/03/17 19:58:31 INFO DAGScheduler: Registering RDD 14 (repartition at <console>:33)
18/03/17 19:58:31 INFO DAGScheduler: Got job 2 (foreach at <console>:35) with 1 output partitions
18/03/17 19:58:31 INFO DAGScheduler: Final stage: ResultStage 8 (foreach at <console>:35)
18/03/17 19:58:31 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 7)
18/03/17 19:58:31 INFO DAGScheduler: Missing parents: List(ShuffleMapStage 7)
18/03/17 19:58:31 INFO DAGScheduler: Submitting ShuffleMapStage 7 (MapPartitionsRDD[14] at repartition at <console>:33), which has no missing parents
18/03/17 19:58:31 INFO MemoryStore: Block broadcast_6 stored as values in memory (estimated size 4.3 KB, free 413.4 MB)
18/03/17 19:58:31 INFO MemoryStore: Block broadcast_6_piece0 stored as bytes in memory (estimated size 2.5 KB, free 413.4 MB)
18/03/17 19:58:31 INFO BlockManagerInfo: Added broadcast_6_piece0 in memory on 192.168.86.151:55570 (size: 2.5 KB, free: 413.9 MB)
18/03/17 19:58:31 INFO SparkContext: Created broadcast 6 from broadcast at DAGScheduler.scala:1006
18/03/17 19:58:31 INFO DAGScheduler: Submitting 2 missing tasks from ShuffleMapStage 7 (MapPartitionsRDD[14] at repartition at <console>:33) (first 15 tasks are for partitions Vector(0, 1))
18/03/17 19:58:31 INFO TaskSchedulerImpl: Adding task set 7.0 with 2 tasks
18/03/17 19:58:31 INFO TaskSetManager: Starting task 0.0 in stage 7.0 (TID 8, localhost, executor driver, partition 0, ANY, 4610 bytes)
18/03/17 19:58:31 INFO Executor: Running task 0.0 in stage 7.0 (TID 8)
18/03/17 19:58:31 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/17 19:58:31 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 1 ms
18/03/17 19:58:31 INFO Executor: Finished task 0.0 in stage 7.0 (TID 8). 1284 bytes result sent to driver
18/03/17 19:58:31 INFO TaskSetManager: Starting task 1.0 in stage 7.0 (TID 9, localhost, executor driver, partition 1, ANY, 4610 bytes)
18/03/17 19:58:31 INFO TaskSetManager: Finished task 0.0 in stage 7.0 (TID 8) in 93 ms on localhost (executor driver) (1/2)
18/03/17 19:58:31 INFO Executor: Running task 1.0 in stage 7.0 (TID 9)
18/03/17 19:58:31 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/17 19:58:31 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 6 ms
18/03/17 19:58:31 INFO Executor: Finished task 1.0 in stage 7.0 (TID 9). 1284 bytes result sent to driver
18/03/17 19:58:31 INFO TaskSetManager: Finished task 1.0 in stage 7.0 (TID 9) in 65 ms on localhost (executor driver) (2/2)
18/03/17 19:58:31 INFO TaskSchedulerImpl: Removed TaskSet 7.0, whose tasks have all completed, from pool
18/03/17 19:58:31 INFO DAGScheduler: ShuffleMapStage 7 (repartition at <console>:33) finished in 0.157 s
18/03/17 19:58:31 INFO DAGScheduler: looking for newly runnable stages
18/03/17 19:58:31 INFO DAGScheduler: running: Set()
18/03/17 19:58:31 INFO DAGScheduler: waiting: Set(ResultStage 8)
18/03/17 19:58:31 INFO DAGScheduler: failed: Set()
18/03/17 19:58:31 INFO DAGScheduler: Submitting ResultStage 8 (MapPartitionsRDD[17] at repartition at <console>:33), which has no missing parents
18/03/17 19:58:31 INFO MemoryStore: Block broadcast_7 stored as values in memory (estimated size 3.3 KB, free 413.4 MB)
18/03/17 19:58:31 INFO MemoryStore: Block broadcast_7_piece0 stored as bytes in memory (estimated size 1934.0 B, free 413.4 MB)
18/03/17 19:58:31 INFO BlockManagerInfo: Added broadcast_7_piece0 in memory on 192.168.86.151:55570 (size: 1934.0 B, free: 413.9 MB)
18/03/17 19:58:31 INFO SparkContext: Created broadcast 7 from broadcast at DAGScheduler.scala:1006
18/03/17 19:58:31 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 8 (MapPartitionsRDD[17] at repartition at <console>:33) (first 15 tasks are for partitions Vector(0))
18/03/17 19:58:31 INFO TaskSchedulerImpl: Adding task set 8.0 with 1 tasks
18/03/17 19:58:31 INFO TaskSetManager: Starting task 0.0 in stage 8.0 (TID 10, localhost, executor driver, partition 0, ANY, 4897 bytes)
18/03/17 19:58:31 INFO Executor: Running task 0.0 in stage 8.0 (TID 10)
18/03/17 19:58:31 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/17 19:58:31 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
(mysql,9)
(hive,8)
(hadoop,6)
(she,4)
(hello,4)
(node,3)
(eggg,2)
(lele,2)
(bag,2)
(haad,2)
(kjkd,2)
(tom,2)
(who,2)
(pig,2)
(jkji,2)
(word,2)
(kljk,2)
(sqoop,2)
(dag,2)
(me,2)
(kjfklds,2)
(you,2)
(whast,2)
(he,2)
(kjdskf,2)
(jdifj,2)
(jack,2)
(where,2)
(take,2)
(hjsdhj,2)
(kjskldjf,2)
(spatk,2)
(heeef,2)
(spring,2)
(hbase,2)
18/03/17 19:58:32 INFO Executor: Finished task 0.0 in stage 8.0 (TID 10). 966 bytes result sent to driver
18/03/17 19:58:32 INFO TaskSetManager: Finished task 0.0 in stage 8.0 (TID 10) in 51 ms on localhost (executor driver) (1/1)
18/03/17 19:58:32 INFO TaskSchedulerImpl: Removed TaskSet 8.0, whose tasks have all completed, from pool
18/03/17 19:58:32 INFO DAGScheduler: ResultStage 8 (foreach at <console>:35) finished in 0.049 s
18/03/17 19:58:32 INFO DAGScheduler: Job 2 finished: foreach at <console>:35, took 0.288514 s

scala>

 

 

In fact we can use more than one partition:

(screenshots)

 

 

scala> lines.repartition(3)
res4: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[21] at repartition at <console>:33

scala> res4.foreach(println)
18/03/17 20:01:53 INFO SparkContext: Starting job: foreach at <console>:35
18/03/17 20:01:53 INFO MapOutputTrackerMaster: Size of output statuses for shuffle 0 is 162 bytes
18/03/17 20:01:53 INFO MapOutputTrackerMaster: Size of output statuses for shuffle 1 is 162 bytes
18/03/17 20:01:53 INFO DAGScheduler: Registering RDD 18 (repartition at <console>:33)
18/03/17 20:01:53 INFO DAGScheduler: Got job 3 (foreach at <console>:35) with 3 output partitions
18/03/17 20:01:53 INFO DAGScheduler: Final stage: ResultStage 12 (foreach at <console>:35)
18/03/17 20:01:53 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 11)
18/03/17 20:01:53 INFO DAGScheduler: Missing parents: List(ShuffleMapStage 11)
18/03/17 20:01:53 INFO DAGScheduler: Submitting ShuffleMapStage 11 (MapPartitionsRDD[18] at repartition at <console>:33), which has no missing parents
18/03/17 20:01:53 INFO MemoryStore: Block broadcast_8 stored as values in memory (estimated size 4.3 KB, free 413.4 MB)
18/03/17 20:01:53 INFO MemoryStore: Block broadcast_8_piece0 stored as bytes in memory (estimated size 2.5 KB, free 413.4 MB)
18/03/17 20:01:53 INFO BlockManagerInfo: Added broadcast_8_piece0 in memory on 192.168.86.151:55570 (size: 2.5 KB, free: 413.9 MB)
18/03/17 20:01:53 INFO SparkContext: Created broadcast 8 from broadcast at DAGScheduler.scala:1006
18/03/17 20:01:53 INFO DAGScheduler: Submitting 2 missing tasks from ShuffleMapStage 11 (MapPartitionsRDD[18] at repartition at <console>:33) (first 15 tasks are for partitions Vector(0, 1))
18/03/17 20:01:53 INFO TaskSchedulerImpl: Adding task set 11.0 with 2 tasks
18/03/17 20:01:53 INFO TaskSetManager: Starting task 0.0 in stage 11.0 (TID 11, localhost, executor driver, partition 0, ANY, 4610 bytes)
18/03/17 20:01:53 INFO Executor: Running task 0.0 in stage 11.0 (TID 11)
18/03/17 20:01:53 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/17 20:01:53 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 1 ms
18/03/17 20:01:53 INFO Executor: Finished task 0.0 in stage 11.0 (TID 11). 1286 bytes result sent to driver
18/03/17 20:01:53 INFO TaskSetManager: Starting task 1.0 in stage 11.0 (TID 12, localhost, executor driver, partition 1, ANY, 4610 bytes)
18/03/17 20:01:53 INFO TaskSetManager: Finished task 0.0 in stage 11.0 (TID 11) in 71 ms on localhost (executor driver) (1/2)
18/03/17 20:01:53 INFO Executor: Running task 1.0 in stage 11.0 (TID 12)
18/03/17 20:01:53 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/17 20:01:53 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 9 ms
18/03/17 20:01:53 INFO Executor: Finished task 1.0 in stage 11.0 (TID 12). 1286 bytes result sent to driver
18/03/17 20:01:53 INFO TaskSetManager: Finished task 1.0 in stage 11.0 (TID 12) in 65 ms on localhost (executor driver) (2/2)
18/03/17 20:01:53 INFO TaskSchedulerImpl: Removed TaskSet 11.0, whose tasks have all completed, from pool
18/03/17 20:01:53 INFO DAGScheduler: ShuffleMapStage 11 (repartition at <console>:33) finished in 0.134 s
18/03/17 20:01:53 INFO DAGScheduler: looking for newly runnable stages
18/03/17 20:01:53 INFO DAGScheduler: running: Set()
18/03/17 20:01:53 INFO DAGScheduler: waiting: Set(ResultStage 12)
18/03/17 20:01:53 INFO DAGScheduler: failed: Set()
18/03/17 20:01:53 INFO DAGScheduler: Submitting ResultStage 12 (MapPartitionsRDD[21] at repartition at <console>:33), which has no missing parents
18/03/17 20:01:53 INFO MemoryStore: Block broadcast_9 stored as values in memory (estimated size 3.3 KB, free 413.4 MB)
18/03/17 20:01:53 INFO MemoryStore: Block broadcast_9_piece0 stored as bytes in memory (estimated size 1938.0 B, free 413.4 MB)
18/03/17 20:01:53 INFO BlockManagerInfo: Added broadcast_9_piece0 in memory on 192.168.86.151:55570 (size: 1938.0 B, free: 413.9 MB)
18/03/17 20:01:53 INFO SparkContext: Created broadcast 9 from broadcast at DAGScheduler.scala:1006
18/03/17 20:01:53 INFO DAGScheduler: Submitting 3 missing tasks from ResultStage 12 (MapPartitionsRDD[21] at repartition at <console>:33) (first 15 tasks are for partitions Vector(0, 1, 2))
18/03/17 20:01:53 INFO TaskSchedulerImpl: Adding task set 12.0 with 3 tasks
18/03/17 20:01:53 INFO TaskSetManager: Starting task 0.0 in stage 12.0 (TID 13, localhost, executor driver, partition 0, ANY, 4897 bytes)
18/03/17 20:01:53 INFO Executor: Running task 0.0 in stage 12.0 (TID 13)
18/03/17 20:01:53 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/17 20:01:53 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
18/03/17 20:01:53 INFO BlockManagerInfo: Removed broadcast_6_piece0 on 192.168.86.151:55570 in memory (size: 2.5 KB, free: 413.9 MB)
(hadoop,6)
(node,3)
(bag,2)
(tom,2)
(jkji,2)
(sqoop,2)
(kjfklds,2)
(he,2)
(jack,2)
(hjsdhj,2)
(heeef,2)
18/03/17 20:01:53 INFO Executor: Finished task 0.0 in stage 12.0 (TID 13). 966 bytes result sent to driver
18/03/17 20:01:53 INFO TaskSetManager: Starting task 1.0 in stage 12.0 (TID 14, localhost, executor driver, partition 1, ANY, 4897 bytes)
18/03/17 20:01:53 INFO TaskSetManager: Finished task 0.0 in stage 12.0 (TID 13) in 27 ms on localhost (executor driver) (1/3)
18/03/17 20:01:53 INFO Executor: Running task 1.0 in stage 12.0 (TID 14)
18/03/17 20:01:53 INFO BlockManagerInfo: Removed broadcast_7_piece0 on 192.168.86.151:55570 in memory (size: 1934.0 B, free: 413.9 MB)
18/03/17 20:01:53 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/17 20:01:53 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
(mysql,9)
(she,4)
(eggg,2)
(haad,2)
(who,2)
(word,2)
(dag,2)
(you,2)
(kjdskf,2)
(where,2)
(kjskldjf,2)
(spring,2)
18/03/17 20:01:53 INFO Executor: Finished task 1.0 in stage 12.0 (TID 14). 966 bytes result sent to driver
18/03/17 20:01:53 INFO TaskSetManager: Starting task 2.0 in stage 12.0 (TID 15, localhost, executor driver, partition 2, ANY, 4897 bytes)
18/03/17 20:01:53 INFO TaskSetManager: Finished task 1.0 in stage 12.0 (TID 14) in 24 ms on localhost (executor driver) (2/3)
18/03/17 20:01:53 INFO Executor: Running task 2.0 in stage 12.0 (TID 15)
18/03/17 20:01:53 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/17 20:01:53 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
(hive,8)
(hello,4)
(lele,2)
(kjkd,2)
(pig,2)
(kljk,2)
(me,2)
(whast,2)
(jdifj,2)
(take,2)
(spatk,2)
(hbase,2)
18/03/17 20:01:53 INFO Executor: Finished task 2.0 in stage 12.0 (TID 15). 966 bytes result sent to driver
18/03/17 20:01:53 INFO TaskSetManager: Finished task 2.0 in stage 12.0 (TID 15) in 25 ms on localhost (executor driver) (3/3)
18/03/17 20:01:53 INFO DAGScheduler: ResultStage 12 (foreach at <console>:35) finished in 0.075 s
18/03/17 20:01:53 INFO TaskSchedulerImpl: Removed TaskSet 12.0, whose tasks have all completed, from pool
18/03/17 20:01:53 INFO DAGScheduler: Job 3 finished: foreach at <console>:35, took 0.318501 s

scala>
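A side note, not from the original post: repartition always performs a full shuffle (it is implemented as coalesce with shuffle = true). When you only want to reduce the number of partitions, coalesce can do it without shuffling:

// Hedged aside: shrink the partition count without a full shuffle.
val fewer = lines.coalesce(1)
fewer.getNumPartitions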

 

 

 

(screenshots)

 

 

As usual, start spark-shell first.

(screenshots)

 

Again, run everything from before, this time converting the final result to a DataFrame with toDF:

val rdd1=sc.textFile("file:///opt/datas/stu.txt")
val rdd2=sc.textFile("file:///opt/datas/stu1.txt")
val rdd=rdd1.union(rdd2)
val lines = rdd.flatMap(x=>x.split(" ")).map(x=>(x,1)).reduceByKey((a,b)=>(a+b)).filter(x => (x._2>1)).map(x=>(x._2,x._1)).sortByKey(false).map(x=>(x._2,x._1)).toDF
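A note on the trailing .toDF (an aside, not from the post): spark-shell pre-imports the implicits of the active SparkSession, which is what makes .toDF available on an RDD of tuples. In a standalone program you would add the import yourself, roughly:

// Hedged sketch for a standalone application; `spark` is the SparkSession created earlier.
import spark.implicits._
val wordCounts = rdd.flatMap(x => x.split(" ")).map(x => (x, 1)).reduceByKey((a, b) => a + b)
val df = wordCounts.toDF("word", "count")   // column names are illustrative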

 

 

(screenshot)

scala> val rdd1=sc.textFile("file:///opt/datas/stu.txt")
18/03/18 11:19:14 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 229.9 KB, free 413.7 MB)
18/03/18 11:19:14 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 21.4 KB, free 413.7 MB)
18/03/18 11:19:14 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on 192.168.86.151:34518 (size: 21.4 KB, free: 413.9 MB)
18/03/18 11:19:14 INFO SparkContext: Created broadcast 0 from textFile at <console>:24
rdd1: org.apache.spark.rdd.RDD[String] = file:///opt/datas/stu.txt MapPartitionsRDD[1] at textFile at <console>:24

scala> val rdd2=sc.textFile("file:///opt/datas/stu1.txt")
18/03/18 11:19:15 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 229.9 KB, free 413.5 MB)
18/03/18 11:19:15 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 21.4 KB, free 413.4 MB)
18/03/18 11:19:15 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on 192.168.86.151:34518 (size: 21.4 KB, free: 413.9 MB)
18/03/18 11:19:15 INFO SparkContext: Created broadcast 1 from textFile at <console>:24
rdd2: org.apache.spark.rdd.RDD[String] = file:///opt/datas/stu1.txt MapPartitionsRDD[3] at textFile at <console>:24

scala> val rdd=rdd1.union(rdd2)
rdd: org.apache.spark.rdd.RDD[String] = UnionRDD[4] at union at <console>:28

scala> val lines = rdd.flatMap(x=>x.split(" ")).map(x=>(x,1)).reduceByKey((a,b)=>(a+b)).filter(x => (x._2>1)).map(x=>(x._2,x._1)).sortByKey(false).map(x=>(x._2,x._1)).toDF
18/03/18 11:19:20 INFO FileInputFormat: Total input paths to process : 1
18/03/18 11:19:20 INFO FileInputFormat: Total input paths to process : 1
18/03/18 11:19:20 INFO SparkContext: Starting job: sortByKey at <console>:30
18/03/18 11:19:21 INFO DAGScheduler: Registering RDD 6 (map at <console>:30)
18/03/18 11:19:21 INFO DAGScheduler: Got job 0 (sortByKey at <console>:30) with 2 output partitions
18/03/18 11:19:21 INFO DAGScheduler: Final stage: ResultStage 1 (sortByKey at <console>:30)
18/03/18 11:19:21 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 0)
18/03/18 11:19:21 INFO DAGScheduler: Missing parents: List(ShuffleMapStage 0)
18/03/18 11:19:21 INFO DAGScheduler: Submitting ShuffleMapStage 0 (MapPartitionsRDD[6] at map at <console>:30), which has no missing parents
18/03/18 11:19:21 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 5.5 KB, free 413.4 MB)
18/03/18 11:19:21 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 3.2 KB, free 413.4 MB)
18/03/18 11:19:21 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on 192.168.86.151:34518 (size: 3.2 KB, free: 413.9 MB)
18/03/18 11:19:21 INFO SparkContext: Created broadcast 2 from broadcast at DAGScheduler.scala:1006
18/03/18 11:19:21 INFO DAGScheduler: Submitting 2 missing tasks from ShuffleMapStage 0 (MapPartitionsRDD[6] at map at <console>:30) (first 15 tasks are for partitions Vector(0, 1))
18/03/18 11:19:21 INFO TaskSchedulerImpl: Adding task set 0.0 with 2 tasks
18/03/18 11:19:21 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, executor driver, partition 0, PROCESS_LOCAL, 4940 bytes)
18/03/18 11:19:21 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
18/03/18 11:19:21 INFO HadoopRDD: Input split: file:/opt/datas/stu.txt:0+216
18/03/18 11:19:22 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 1156 bytes result sent to driver
18/03/18 11:19:22 INFO TaskSetManager: Starting task 1.0 in stage 0.0 (TID 1, localhost, executor driver, partition 1, PROCESS_LOCAL, 4941 bytes)
18/03/18 11:19:22 INFO Executor: Running task 1.0 in stage 0.0 (TID 1)
18/03/18 11:19:22 INFO HadoopRDD: Input split: file:/opt/datas/stu1.txt:0+309
18/03/18 11:19:22 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 662 ms on localhost (executor driver) (1/2)
18/03/18 11:19:22 INFO Executor: Finished task 1.0 in stage 0.0 (TID 1). 1113 bytes result sent to driver
18/03/18 11:19:22 INFO TaskSetManager: Finished task 1.0 in stage 0.0 (TID 1) in 150 ms on localhost (executor driver) (2/2)
18/03/18 11:19:22 INFO DAGScheduler: ShuffleMapStage 0 (map at <console>:30) finished in 0.806 s
18/03/18 11:19:22 INFO DAGScheduler: looking for newly runnable stages
18/03/18 11:19:22 INFO DAGScheduler: running: Set()
18/03/18 11:19:22 INFO DAGScheduler: waiting: Set(ResultStage 1)
18/03/18 11:19:22 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
18/03/18 11:19:22 INFO DAGScheduler: failed: Set()
18/03/18 11:19:22 INFO DAGScheduler: Submitting ResultStage 1 (MapPartitionsRDD[11] at sortByKey at <console>:30), which has no missing parents
18/03/18 11:19:22 INFO MemoryStore: Block broadcast_3 stored as values in memory (estimated size 4.3 KB, free 413.4 MB)
18/03/18 11:19:22 INFO MemoryStore: Block broadcast_3_piece0 stored as bytes in memory (estimated size 2.4 KB, free 413.4 MB)
18/03/18 11:19:22 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on 192.168.86.151:34518 (size: 2.4 KB, free: 413.9 MB)
18/03/18 11:19:22 INFO SparkContext: Created broadcast 3 from broadcast at DAGScheduler.scala:1006
18/03/18 11:19:22 INFO DAGScheduler: Submitting 2 missing tasks from ResultStage 1 (MapPartitionsRDD[11] at sortByKey at <console>:30) (first 15 tasks are for partitions Vector(0, 1))
18/03/18 11:19:22 INFO TaskSchedulerImpl: Adding task set 1.0 with 2 tasks
18/03/18 11:19:22 INFO TaskSetManager: Starting task 0.0 in stage 1.0 (TID 2, localhost, executor driver, partition 0, ANY, 4621 bytes)
18/03/18 11:19:22 INFO Executor: Running task 0.0 in stage 1.0 (TID 2)
18/03/18 11:19:22 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/18 11:19:22 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 18 ms
18/03/18 11:19:22 INFO Executor: Finished task 0.0 in stage 1.0 (TID 2). 1439 bytes result sent to driver
18/03/18 11:19:22 INFO TaskSetManager: Starting task 1.0 in stage 1.0 (TID 3, localhost, executor driver, partition 1, ANY, 4621 bytes)
18/03/18 11:19:22 INFO Executor: Running task 1.0 in stage 1.0 (TID 3)
18/03/18 11:19:22 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 2) in 210 ms on localhost (executor driver) (1/2)
18/03/18 11:19:22 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/18 11:19:22 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
18/03/18 11:19:22 INFO Executor: Finished task 1.0 in stage 1.0 (TID 3). 1384 bytes result sent to driver
18/03/18 11:19:22 INFO TaskSetManager: Finished task 1.0 in stage 1.0 (TID 3) in 75 ms on localhost (executor driver) (2/2)
18/03/18 11:19:22 INFO DAGScheduler: ResultStage 1 (sortByKey at <console>:30) finished in 0.275 s
18/03/18 11:19:22 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool
18/03/18 11:19:22 INFO DAGScheduler: Job 0 finished: sortByKey at <console>:30, took 2.315575 s
18/03/18 11:19:24 INFO BlockManagerInfo: Removed broadcast_3_piece0 on 192.168.86.151:34518 in memory (size: 2.4 KB, free: 413.9 MB)
18/03/18 11:19:24 INFO BlockManagerInfo: Removed broadcast_2_piece0 on 192.168.86.151:34518 in memory (size: 3.2 KB, free: 413.9 MB)
lines: org.apache.spark.sql.DataFrame = [_1: string, _2: int]

scala>

 

 

(screenshots)
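The intermediate screenshots are not reproduced here, but judging from the schema in the query result below ([key: string, value: int]), the DataFrame was presumably rebuilt with named columns before registering the temp view, along the lines of:

// Hedged guess at the step shown in the screenshots: name the columns so the view
// exposes `key` and `value` instead of `_1` and `_2`.
val lines = rdd.flatMap(x=>x.split(" ")).map(x=>(x,1)).reduceByKey((a,b)=>(a+b)).filter(x => (x._2>1)).map(x=>(x._2,x._1)).sortByKey(false).map(x=>(x._2,x._1)).toDF("key", "value")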

 

 

scala> lines.createOrReplaceTempView("spark")
18/03/18 11:30:07 INFO SparkSqlParser: Parsing command: spark
18/03/18 11:30:07 INFO ContextCleaner: Cleaned accumulator 120
18/03/18 11:30:07 INFO BlockManagerInfo: Removed broadcast_6_piece0 on 192.168.86.151:34518 in memory (size: 2.5 KB, free: 413.9 MB)
18/03/18 11:30:07 INFO ContextCleaner: Cleaned accumulator 121
18/03/18 11:30:07 INFO BlockManagerInfo: Removed broadcast_7_piece0 on 192.168.86.151:34518 in memory (size: 4.7 KB, free: 413.9 MB)
18/03/18 11:30:07 INFO BlockManagerInfo: Removed broadcast_8_piece0 on 192.168.86.151:34518 in memory (size: 4.7 KB, free: 413.9 MB)

scala> spark.sql("select * from spark")
18/03/18 11:30:44 INFO SparkSqlParser: Parsing command: select * from spark
res3: org.apache.spark.sql.DataFrame = [key: string, value: int]

scala> spark.sql("select * from spark").show()
18/03/18 11:32:29 INFO SparkSqlParser: Parsing command: select * from spark
18/03/18 11:32:30 INFO CodeGenerator: Code generated in 122.293778 ms
18/03/18 11:32:30 INFO SparkContext: Starting job: show at <console>:24
18/03/18 11:32:30 INFO DAGScheduler: Got job 4 (show at <console>:24) with 1 output partitions
18/03/18 11:32:30 INFO DAGScheduler: Final stage: ResultStage 12 (show at <console>:24)
18/03/18 11:32:30 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 11)
18/03/18 11:32:30 INFO DAGScheduler: Missing parents: List()
18/03/18 11:32:30 INFO DAGScheduler: Submitting ResultStage 12 (MapPartitionsRDD[28] at show at <console>:24), which has no missing parents
18/03/18 11:32:30 INFO MemoryStore: Block broadcast_9 stored as values in memory (estimated size 10.0 KB, free 413.4 MB)
18/03/18 11:32:30 INFO MemoryStore: Block broadcast_9_piece0 stored as bytes in memory (estimated size 4.9 KB, free 413.4 MB)
18/03/18 11:32:30 INFO BlockManagerInfo: Added broadcast_9_piece0 in memory on 192.168.86.151:34518 (size: 4.9 KB, free: 413.9 MB)
18/03/18 11:32:30 INFO SparkContext: Created broadcast 9 from broadcast at DAGScheduler.scala:1006
18/03/18 11:32:30 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 12 (MapPartitionsRDD[28] at show at <console>:24) (first 15 tasks are for partitions Vector(0))
18/03/18 11:32:30 INFO TaskSchedulerImpl: Adding task set 12.0 with 1 tasks
18/03/18 11:32:30 INFO TaskSetManager: Starting task 0.0 in stage 12.0 (TID 12, localhost, executor driver, partition 0, ANY, 4621 bytes)
18/03/18 11:32:30 INFO Executor: Running task 0.0 in stage 12.0 (TID 12)
18/03/18 11:32:30 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/18 11:32:30 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
18/03/18 11:32:30 INFO Executor: Finished task 0.0 in stage 12.0 (TID 12). 1608 bytes result sent to driver
18/03/18 11:32:30 INFO TaskSetManager: Finished task 0.0 in stage 12.0 (TID 12) in 67 ms on localhost (executor driver) (1/1)
18/03/18 11:32:30 INFO TaskSchedulerImpl: Removed TaskSet 12.0, whose tasks have all completed, from pool
18/03/18 11:32:30 INFO DAGScheduler: ResultStage 12 (show at <console>:24) finished in 0.065 s
18/03/18 11:32:30 INFO DAGScheduler: Job 4 finished: show at <console>:24, took 0.108798 s
18/03/18 11:32:30 INFO SparkContext: Starting job: show at <console>:24
18/03/18 11:32:30 INFO DAGScheduler: Got job 5 (show at <console>:24) with 1 output partitions
18/03/18 11:32:30 INFO DAGScheduler: Final stage: ResultStage 15 (show at <console>:24)
18/03/18 11:32:30 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 14)
18/03/18 11:32:30 INFO DAGScheduler: Missing parents: List()
18/03/18 11:32:30 INFO DAGScheduler: Submitting ResultStage 15 (MapPartitionsRDD[28] at show at <console>:24), which has no missing parents
18/03/18 11:32:30 INFO MemoryStore: Block broadcast_10 stored as values in memory (estimated size 10.0 KB, free 413.4 MB)
18/03/18 11:32:30 INFO MemoryStore: Block broadcast_10_piece0 stored as bytes in memory (estimated size 4.9 KB, free 413.4 MB)
18/03/18 11:32:30 INFO BlockManagerInfo: Added broadcast_10_piece0 in memory on 192.168.86.151:34518 (size: 4.9 KB, free: 413.9 MB)
18/03/18 11:32:30 INFO SparkContext: Created broadcast 10 from broadcast at DAGScheduler.scala:1006
18/03/18 11:32:30 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 15 (MapPartitionsRDD[28] at show at <console>:24) (first 15 tasks are for partitions Vector(1))
18/03/18 11:32:30 INFO TaskSchedulerImpl: Adding task set 15.0 with 1 tasks
18/03/18 11:32:30 INFO TaskSetManager: Starting task 0.0 in stage 15.0 (TID 13, localhost, executor driver, partition 1, ANY, 4621 bytes)
18/03/18 11:32:30 INFO Executor: Running task 0.0 in stage 15.0 (TID 13)
18/03/18 11:32:30 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/18 11:32:30 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
18/03/18 11:32:30 INFO Executor: Finished task 0.0 in stage 15.0 (TID 13). 1731 bytes result sent to driver
18/03/18 11:32:30 INFO TaskSetManager: Finished task 0.0 in stage 15.0 (TID 13) in 47 ms on localhost (executor driver) (1/1)
18/03/18 11:32:30 INFO TaskSchedulerImpl: Removed TaskSet 15.0, whose tasks have all completed, from pool
18/03/18 11:32:30 INFO DAGScheduler: ResultStage 15 (show at <console>:24) finished in 0.046 s
18/03/18 11:32:30 INFO DAGScheduler: Job 5 finished: show at <console>:24, took 0.084031 s
18/03/18 11:32:30 INFO CodeGenerator: Code generated in 47.514407 ms
+------+-----+
| key|value|
+------+-----+
| mysql| 9|
| hive| 8|
|hadoop| 6|
| she| 4|
| hello| 4|
| node| 3|
| eggg| 2|
| lele| 2|
| bag| 2|
| haad| 2|
| kjkd| 2|
| tom| 2|
| who| 2|
| pig| 2|
| jkji| 2|
| word| 2|
| kljk| 2|
| sqoop| 2|
| dag| 2|
| me| 2|
+------+-----+
only showing top 20 rows


scala>
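
As the log shows, spark.sql only parses the statement and returns a DataFrame; it is show() that launches the jobs. The pattern used in the shell, sketched (the view name follows the session above):

// Register the DataFrame as a temporary view and query it with SQL.
lines.createOrReplaceTempView("spark")           // no job runs here
val result = spark.sql("select * from spark")    // lazy: just a DataFrame (see res3 above)
result.show()                                    // action: triggers the jobs in the log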

 

 

 

scala> spark.sql("select key from spark").show()
18/03/18 11:34:29 INFO ContextCleaner: Cleaned accumulator 194
18/03/18 11:34:29 INFO BlockManagerInfo: Removed broadcast_9_piece0 on 192.168.86.151:34518 in memory (size: 4.9 KB, free: 413.9 MB)
18/03/18 11:34:29 INFO SparkSqlParser: Parsing command: select key from spark
18/03/18 11:34:29 INFO ContextCleaner: Cleaned accumulator 195
18/03/18 11:34:29 INFO BlockManagerInfo: Removed broadcast_10_piece0 on 192.168.86.151:34518 in memory (size: 4.9 KB, free: 413.9 MB)
18/03/18 11:34:29 INFO SparkContext: Starting job: show at <console>:24
18/03/18 11:34:29 INFO DAGScheduler: Got job 6 (show at <console>:24) with 1 output partitions
18/03/18 11:34:29 INFO DAGScheduler: Final stage: ResultStage 18 (show at <console>:24)
18/03/18 11:34:29 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 17)
18/03/18 11:34:29 INFO DAGScheduler: Missing parents: List()
18/03/18 11:34:29 INFO DAGScheduler: Submitting ResultStage 18 (MapPartitionsRDD[31] at show at <console>:24), which has no missing parents
18/03/18 11:34:29 INFO MemoryStore: Block broadcast_11 stored as values in memory (estimated size 9.4 KB, free 413.4 MB)
18/03/18 11:34:29 INFO MemoryStore: Block broadcast_11_piece0 stored as bytes in memory (estimated size 4.7 KB, free 413.4 MB)
18/03/18 11:34:29 INFO BlockManagerInfo: Added broadcast_11_piece0 in memory on 192.168.86.151:34518 (size: 4.7 KB, free: 413.9 MB)
18/03/18 11:34:29 INFO SparkContext: Created broadcast 11 from broadcast at DAGScheduler.scala:1006
18/03/18 11:34:29 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 18 (MapPartitionsRDD[31] at show at <console>:24) (first 15 tasks are for partitions Vector(0))
18/03/18 11:34:29 INFO TaskSchedulerImpl: Adding task set 18.0 with 1 tasks
18/03/18 11:34:29 INFO TaskSetManager: Starting task 0.0 in stage 18.0 (TID 14, localhost, executor driver, partition 0, ANY, 4621 bytes)
18/03/18 11:34:29 INFO Executor: Running task 0.0 in stage 18.0 (TID 14)
18/03/18 11:34:29 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/18 11:34:29 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
18/03/18 11:34:29 INFO Executor: Finished task 0.0 in stage 18.0 (TID 14). 1574 bytes result sent to driver
18/03/18 11:34:29 INFO TaskSetManager: Finished task 0.0 in stage 18.0 (TID 14) in 51 ms on localhost (executor driver) (1/1)
18/03/18 11:34:29 INFO DAGScheduler: ResultStage 18 (show at <console>:24) finished in 0.051 s
18/03/18 11:34:29 INFO DAGScheduler: Job 6 finished: show at <console>:24, took 0.080995 s
18/03/18 11:34:29 INFO TaskSchedulerImpl: Removed TaskSet 18.0, whose tasks have all completed, from pool
18/03/18 11:34:29 INFO SparkContext: Starting job: show at <console>:24
18/03/18 11:34:29 INFO DAGScheduler: Got job 7 (show at <console>:24) with 1 output partitions
18/03/18 11:34:29 INFO DAGScheduler: Final stage: ResultStage 21 (show at <console>:24)
18/03/18 11:34:29 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 20)
18/03/18 11:34:29 INFO DAGScheduler: Missing parents: List()
18/03/18 11:34:29 INFO DAGScheduler: Submitting ResultStage 21 (MapPartitionsRDD[31] at show at <console>:24), which has no missing parents
18/03/18 11:34:29 INFO MemoryStore: Block broadcast_12 stored as values in memory (estimated size 9.4 KB, free 413.4 MB)
18/03/18 11:34:29 INFO MemoryStore: Block broadcast_12_piece0 stored as bytes in memory (estimated size 4.7 KB, free 413.4 MB)
18/03/18 11:34:29 INFO BlockManagerInfo: Added broadcast_12_piece0 in memory on 192.168.86.151:34518 (size: 4.7 KB, free: 413.9 MB)
18/03/18 11:34:29 INFO SparkContext: Created broadcast 12 from broadcast at DAGScheduler.scala:1006
18/03/18 11:34:29 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 21 (MapPartitionsRDD[31] at show at <console>:24) (first 15 tasks are for partitions Vector(1))
18/03/18 11:34:29 INFO TaskSchedulerImpl: Adding task set 21.0 with 1 tasks
18/03/18 11:34:29 INFO TaskSetManager: Starting task 0.0 in stage 21.0 (TID 15, localhost, executor driver, partition 1, ANY, 4621 bytes)
18/03/18 11:34:29 INFO Executor: Running task 0.0 in stage 21.0 (TID 15)
18/03/18 11:34:29 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
18/03/18 11:34:29 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
18/03/18 11:34:29 INFO Executor: Finished task 0.0 in stage 21.0 (TID 15). 1722 bytes result sent to driver
18/03/18 11:34:29 INFO TaskSetManager: Finished task 0.0 in stage 21.0 (TID 15) in 58 ms on localhost (executor driver) (1/1)
18/03/18 11:34:29 INFO DAGScheduler: ResultStage 21 (show at <console>:24) finished in 0.053 s
18/03/18 11:34:29 INFO DAGScheduler: Job 7 finished: show at <console>:24, took 0.096304 s
18/03/18 11:34:29 INFO TaskSchedulerImpl: Removed TaskSet 21.0, whose tasks have all completed, from pool
+------+
| key|
+------+
| mysql|
| hive|
|hadoop|
| she|
| hello|
| node|
| eggg|
| lele|
| bag|
| haad|
| kjkd|
| tom|
| who|
| pig|
| jkji|
| word|
| kljk|
| sqoop|
| dag|
| me|
+------+
only showing top 20 rows


scala>
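
Projecting a single column with SQL has a direct DataFrame-API counterpart; a sketch (spark.table looks the registered temp view back up as a DataFrame):

// SQL projection, as typed in the shell
spark.sql("select key from spark").show()

// Equivalent DataFrame API call on the registered view
spark.table("spark").select("key").show()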

 

 

 

scala> val dataSet = spark.read.textFile("file:///opt/datas/stu.txt")
dataSet: org.apache.spark.sql.Dataset[String] = [value: string]

scala> val dataSet = spark.read.textFile("file:///opt/datas/stu.txt").flatMap(x=>x.split(" ")).map(x=>(x,1))
dataSet: org.apache.spark.sql.Dataset[(String, Int)] = [_1: string, _2: int]
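
spark.read.textFile returns a Dataset[String], and after flatMap/map the shell holds a typed Dataset[(String, Int)]. A sketch of continuing the typed pipeline to a per-word count with groupByKey, the Dataset counterpart of the untyped groupBy:

import spark.implicits._

val pairs = spark.read.textFile("file:///opt/datas/stu.txt")
  .flatMap(_.split(" "))
  .map((_, 1))                                  // Dataset[(String, Int)]

// Typed aggregation: group by the word itself and count the rows per group
val counts = pairs.groupByKey(_._1).count()     // Dataset[(String, Long)]
counts.show()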

 

 

scala> val dataSet = spark.read.textFile("file:///opt/datas/stu.txt").flatMap(x=>x.split(" ")).map(x=>(x,1)).toDF("key","value") 
dataSet: org.apache.spark.sql.DataFrame = [key: string, value: int]

scala> val dataSet = spark.read.textFile("file:///opt/datas/stu.txt").flatMap(x=>x.split(" ")).map(x=>(x,1)).toDF("key","value").show()
18/03/18 11:53:31 INFO ContextCleaner: Cleaned accumulator 387
18/03/18 11:53:31 INFO BlockManagerInfo: Removed broadcast_17_piece0 on 192.168.86.151:34518 in memory (size: 21.9 KB, free: 413.9 MB)
18/03/18 11:53:31 INFO ContextCleaner: Cleaned accumulator 389
18/03/18 11:53:31 INFO BlockManagerInfo: Removed broadcast_18_piece0 on 192.168.86.151:34518 in memory (size: 7.0 KB, free: 413.9 MB)
18/03/18 11:53:31 INFO ContextCleaner: Cleaned accumulator 388
18/03/18 11:53:31 INFO ContextCleaner: Cleaned accumulator 386
18/03/18 11:53:31 INFO ContextCleaner: Cleaned accumulator 390
18/03/18 11:53:31 INFO ContextCleaner: Cleaned accumulator 385
18/03/18 11:53:32 INFO FileSourceStrategy: Pruning directories with:
18/03/18 11:53:32 INFO FileSourceStrategy: Post-Scan Filters:
18/03/18 11:53:32 INFO FileSourceStrategy: Output Data Schema: struct<value: string>
18/03/18 11:53:32 INFO FileSourceScanExec: Pushed Filters:
18/03/18 11:53:32 INFO CodeGenerator: Code generated in 108.532527 ms
18/03/18 11:53:32 INFO MemoryStore: Block broadcast_19 stored as values in memory (estimated size 238.0 KB, free 413.2 MB)
18/03/18 11:53:32 INFO MemoryStore: Block broadcast_19_piece0 stored as bytes in memory (estimated size 21.9 KB, free 413.2 MB)
18/03/18 11:53:32 INFO BlockManagerInfo: Added broadcast_19_piece0 in memory on 192.168.86.151:34518 (size: 21.9 KB, free: 413.9 MB)
18/03/18 11:53:32 INFO SparkContext: Created broadcast 19 from show at <console>:23
18/03/18 11:53:32 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4194520 bytes, open cost is considered as scanning 4194304 bytes.
18/03/18 11:53:33 INFO SparkContext: Starting job: show at <console>:23
18/03/18 11:53:33 INFO DAGScheduler: Got job 11 (show at <console>:23) with 1 output partitions
18/03/18 11:53:33 INFO DAGScheduler: Final stage: ResultStage 28 (show at <console>:23)
18/03/18 11:53:33 INFO DAGScheduler: Parents of final stage: List()
18/03/18 11:53:33 INFO DAGScheduler: Missing parents: List()
18/03/18 11:53:33 INFO DAGScheduler: Submitting ResultStage 28 (MapPartitionsRDD[55] at show at <console>:23), which has no missing parents
18/03/18 11:53:33 INFO MemoryStore: Block broadcast_20 stored as values in memory (estimated size 16.0 KB, free 413.2 MB)
18/03/18 11:53:33 INFO MemoryStore: Block broadcast_20_piece0 stored as bytes in memory (estimated size 7.0 KB, free 413.2 MB)
18/03/18 11:53:33 INFO BlockManagerInfo: Added broadcast_20_piece0 in memory on 192.168.86.151:34518 (size: 7.0 KB, free: 413.9 MB)
18/03/18 11:53:33 INFO SparkContext: Created broadcast 20 from broadcast at DAGScheduler.scala:1006
18/03/18 11:53:33 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 28 (MapPartitionsRDD[55] at show at <console>:23) (first 15 tasks are for partitions Vector(0))
18/03/18 11:53:33 INFO TaskSchedulerImpl: Adding task set 28.0 with 1 tasks
18/03/18 11:53:33 INFO TaskSetManager: Starting task 0.0 in stage 28.0 (TID 21, localhost, executor driver, partition 0, PROCESS_LOCAL, 5262 bytes)
18/03/18 11:53:33 INFO Executor: Running task 0.0 in stage 28.0 (TID 21)
18/03/18 11:53:33 INFO FileScanRDD: Reading File path: file:///opt/datas/stu.txt, range: 0-216, partition values: [empty row]
18/03/18 11:53:33 INFO Executor: Finished task 0.0 in stage 28.0 (TID 21). 1786 bytes result sent to driver
18/03/18 11:53:33 INFO TaskSetManager: Finished task 0.0 in stage 28.0 (TID 21) in 49 ms on localhost (executor driver) (1/1)
18/03/18 11:53:33 INFO DAGScheduler: ResultStage 28 (show at <console>:23) finished in 0.049 s
18/03/18 11:53:33 INFO DAGScheduler: Job 11 finished: show at <console>:23, took 0.087955 s
18/03/18 11:53:33 INFO TaskSchedulerImpl: Removed TaskSet 28.0, whose tasks have all completed, from pool
+--------+-----+
| key|value|
+--------+-----+
| hadoop| 1|
| spatk| 1|
| haad| 1|
| heeef| 1|
| hjsdhj| 1|
|kjskldjf| 1|
| kjfklds| 1|
| jkji| 1|
| jdifj| 1|
| take| 1|
| she| 1|
| she| 1|
| word| 1|
| hello| 1|
| hive| 1|
| mysql| 1|
| hbase| 1|
| sqoop| 1|
| pig| 1|
| dag| 1|
+--------+-----+
only showing top 20 rows

dataSet: Unit = ()

scala>
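
The binding dataSet: Unit = () above happens because show() returns Unit, so chaining it onto the val throws the DataFrame away. Keeping the DataFrame and displaying it are better done as two steps (a sketch):

val dataSet = spark.read.textFile("file:///opt/datas/stu.txt")
  .flatMap(_.split(" "))
  .map((_, 1))
  .toDF("key", "value")   // DataFrame with named columns

dataSet.show()            // display it; dataSet still refers to the DataFrame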

 

 

scala> val dataSet = spark.read.textFile("file:///opt/datas/stu.txt").flatMap(x=>x.split(" ")).map(x=>(x,1)).toDF("key","value")

dataSet: org.apache.spark.sql.DataFrame = [key: string, value: int]

scala> val dataSet = spark.read.textFile("file:///opt/datas/stu.txt").flatMap(x=>x.split(" ")).map(x=>(x,1)).toDF("key","value")
.createOrReplaceTempView("spark")
18/03/18 11:59:59 INFO ContextCleaner: Cleaned accumulator 419
18/03/18 11:59:59 INFO ContextCleaner: Cleaned accumulator 415
18/03/18 11:59:59 INFO BlockManagerInfo: Removed broadcast_19_piece0 on 192.168.86.151:34518 in memory (size: 21.9 KB, free: 413.9 MB)
18/03/18 11:59:59 INFO ContextCleaner: Cleaned accumulator 420
18/03/18 11:59:59 INFO BlockManagerInfo: Removed broadcast_20_piece0 on 192.168.86.151:34518 in memory (size: 7.0 KB, free: 413.9 MB)
18/03/18 11:59:59 INFO ContextCleaner: Cleaned accumulator 418
18/03/18 11:59:59 INFO ContextCleaner: Cleaned accumulator 417
18/03/18 11:59:59 INFO ContextCleaner: Cleaned accumulator 416
18/03/18 12:00:00 INFO SparkSqlParser: Parsing command: spark
dataSet: Unit = ()

scala>
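
createOrReplaceTempView also returns Unit, which is why this val is () as well. Registering the view from a named DataFrame keeps both the DataFrame and the view usable (a sketch):

val dataSet = spark.read.textFile("file:///opt/datas/stu.txt")
  .flatMap(_.split(" "))
  .map((_, 1))
  .toDF("key", "value")

dataSet.createOrReplaceTempView("spark")   // register the view; dataSet is still a DataFrame
spark.sql("select * from spark").show()    // query it with SQL, as in the next step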

 

 

 

scala> spark.sql("select * from spark")
18/03/18 12:09:46 INFO SparkSqlParser: Parsing command: select * from spark
res7: org.apache.spark.sql.DataFrame = [key: string, value: int]

scala> spark.sql("select * from spark").show()
18/03/18 12:09:57 INFO SparkSqlParser: Parsing command: select * from spark
18/03/18 12:09:57 INFO FileSourceStrategy: Pruning directories with:
18/03/18 12:09:57 INFO FileSourceStrategy: Post-Scan Filters:
18/03/18 12:09:57 INFO FileSourceStrategy: Output Data Schema: struct<value: string>
18/03/18 12:09:57 INFO FileSourceScanExec: Pushed Filters:
18/03/18 12:09:57 INFO MemoryStore: Block broadcast_21 stored as values in memory (estimated size 238.0 KB, free 413.2 MB)
18/03/18 12:09:57 INFO MemoryStore: Block broadcast_21_piece0 stored as bytes in memory (estimated size 21.9 KB, free 413.2 MB)
18/03/18 12:09:57 INFO BlockManagerInfo: Added broadcast_21_piece0 in memory on 192.168.86.151:34518 (size: 21.9 KB, free: 413.9 MB)
18/03/18 12:09:57 INFO SparkContext: Created broadcast 21 from show at <console>:24
18/03/18 12:09:57 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4194520 bytes, open cost is considered as scanning 4194304 bytes.
18/03/18 12:09:57 INFO SparkContext: Starting job: show at <console>:24
18/03/18 12:09:57 INFO DAGScheduler: Got job 12 (show at <console>:24) with 1 output partitions
18/03/18 12:09:57 INFO DAGScheduler: Final stage: ResultStage 29 (show at <console>:24)
18/03/18 12:09:57 INFO DAGScheduler: Parents of final stage: List()
18/03/18 12:09:57 INFO DAGScheduler: Missing parents: List()
18/03/18 12:09:57 INFO DAGScheduler: Submitting ResultStage 29 (MapPartitionsRDD[61] at show at <console>:24), which has no missing parents
18/03/18 12:09:57 INFO MemoryStore: Block broadcast_22 stored as values in memory (estimated size 16.0 KB, free 413.2 MB)
18/03/18 12:09:57 INFO MemoryStore: Block broadcast_22_piece0 stored as bytes in memory (estimated size 7.0 KB, free 413.2 MB)
18/03/18 12:09:57 INFO BlockManagerInfo: Added broadcast_22_piece0 in memory on 192.168.86.151:34518 (size: 7.0 KB, free: 413.9 MB)
18/03/18 12:09:57 INFO SparkContext: Created broadcast 22 from broadcast at DAGScheduler.scala:1006
18/03/18 12:09:57 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 29 (MapPartitionsRDD[61] at show at <console>:24) (first 15 tasks are for partitions Vector(0))
18/03/18 12:09:57 INFO TaskSchedulerImpl: Adding task set 29.0 with 1 tasks
18/03/18 12:09:57 INFO TaskSetManager: Starting task 0.0 in stage 29.0 (TID 22, localhost, executor driver, partition 0, PROCESS_LOCAL, 5262 bytes)
18/03/18 12:09:57 INFO Executor: Running task 0.0 in stage 29.0 (TID 22)
18/03/18 12:09:57 INFO FileScanRDD: Reading File path: file:///opt/datas/stu.txt, range: 0-216, partition values: [empty row]
18/03/18 12:09:57 INFO Executor: Finished task 0.0 in stage 29.0 (TID 22). 1786 bytes result sent to driver
18/03/18 12:09:57 INFO TaskSetManager: Finished task 0.0 in stage 29.0 (TID 22) in 50 ms on localhost (executor driver) (1/1)
18/03/18 12:09:57 INFO TaskSchedulerImpl: Removed TaskSet 29.0, whose tasks have all completed, from pool
18/03/18 12:09:57 INFO DAGScheduler: ResultStage 29 (show at <console>:24) finished in 0.050 s
18/03/18 12:09:57 INFO DAGScheduler: Job 12 finished: show at <console>:24, took 0.083076 s
+--------+-----+
| key|value|
+--------+-----+
| hadoop| 1|
| spatk| 1|
| haad| 1|
| heeef| 1|
| hjsdhj| 1|
|kjskldjf| 1|
| kjfklds| 1|
| jkji| 1|
| jdifj| 1|
| take| 1|
| she| 1|
| she| 1|
| word| 1|
| hello| 1|
| hive| 1|
| mysql| 1|
| hbase| 1|
| sqoop| 1|
| pig| 1|
| dag| 1|
+--------+-----+
only showing top 20 rows


scala>
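
Every row in this view is still a raw (key, 1) pair, so a grouped query is needed to fold them into per-word totals. A sketch of both the SQL and the DataFrame form (column names follow the view above):

// SQL aggregation over the registered view
spark.sql("select key, sum(value) as total from spark group by key").show()

// DataFrame API equivalent (each value is 1, so counting rows matches the sum)
spark.table("spark").groupBy("key").count().show()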

 

 

scala> lines.select("_1","_2").show
18/03/18 16:00:45 INFO FileSourceStrategy: Pruning directories with:
18/03/18 16:00:45 INFO FileSourceStrategy: Post-Scan Filters:
18/03/18 16:00:45 INFO FileSourceStrategy: Output Data Schema: struct<value: string>
18/03/18 16:00:45 INFO FileSourceScanExec: Pushed Filters:
18/03/18 16:00:45 INFO CodeGenerator: Code generated in 123.289052 ms
18/03/18 16:00:45 INFO MemoryStore: Block broadcast_6 stored as values in memory (estimated size 238.0 KB, free 413.4 MB)
18/03/18 16:00:45 INFO MemoryStore: Block broadcast_6_piece0 stored as bytes in memory (estimated size 21.9 KB, free 413.4 MB)
18/03/18 16:00:45 INFO BlockManagerInfo: Added broadcast_6_piece0 in memory on 192.168.86.151:50190 (size: 21.9 KB, free: 413.9 MB)
18/03/18 16:00:45 INFO SparkContext: Created broadcast 6 from show at <console>:28
18/03/18 16:00:45 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4194520 bytes, open cost is considered as scanning 4194304 bytes.
18/03/18 16:00:45 INFO SparkContext: Starting job: show at <console>:28
18/03/18 16:00:45 INFO DAGScheduler: Got job 3 (show at <console>:28) with 1 output partitions
18/03/18 16:00:45 INFO DAGScheduler: Final stage: ResultStage 3 (show at <console>:28)
18/03/18 16:00:45 INFO DAGScheduler: Parents of final stage: List()
18/03/18 16:00:45 INFO DAGScheduler: Missing parents: List()
18/03/18 16:00:45 INFO DAGScheduler: Submitting ResultStage 3 (MapPartitionsRDD[20] at show at <console>:28), which has no missing parents
18/03/18 16:00:45 INFO MemoryStore: Block broadcast_7 stored as values in memory (estimated size 15.7 KB, free 413.4 MB)
18/03/18 16:00:45 INFO MemoryStore: Block broadcast_7_piece0 stored as bytes in memory (estimated size 7.0 KB, free 413.4 MB)
18/03/18 16:00:45 INFO BlockManagerInfo: Added broadcast_7_piece0 in memory on 192.168.86.151:50190 (size: 7.0 KB, free: 413.9 MB)
18/03/18 16:00:45 INFO SparkContext: Created broadcast 7 from broadcast at DAGScheduler.scala:1006
18/03/18 16:00:45 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 3 (MapPartitionsRDD[20] at show at <console>:28) (first 15 tasks are for partitions Vector(0))
18/03/18 16:00:45 INFO TaskSchedulerImpl: Adding task set 3.0 with 1 tasks
18/03/18 16:00:45 INFO TaskSetManager: Starting task 0.0 in stage 3.0 (TID 3, localhost, executor driver, partition 0, PROCESS_LOCAL, 5262 bytes)
18/03/18 16:00:45 INFO Executor: Running task 0.0 in stage 3.0 (TID 3)
18/03/18 16:00:45 INFO FileScanRDD: Reading File path: file:///opt/datas/stu.txt, range: 0-216, partition values: [empty row]
18/03/18 16:00:45 INFO Executor: Finished task 0.0 in stage 3.0 (TID 3). 1786 bytes result sent to driver
18/03/18 16:00:45 INFO TaskSetManager: Finished task 0.0 in stage 3.0 (TID 3) in 76 ms on localhost (executor driver) (1/1)
18/03/18 16:00:45 INFO DAGScheduler: ResultStage 3 (show at <console>:28) finished in 0.076 s
18/03/18 16:00:45 INFO DAGScheduler: Job 3 finished: show at <console>:28, took 0.113333 s
18/03/18 16:00:45 INFO TaskSchedulerImpl: Removed TaskSet 3.0, whose tasks have all completed, from pool
18/03/18 16:00:45 INFO CodeGenerator: Code generated in 46.614685 ms
+--------+---+
| _1| _2|
+--------+---+
| hadoop| 1|
| spatk| 1|
| haad| 1|
| heeef| 1|
| hjsdhj| 1|
|kjskldjf| 1|
| kjfklds| 1|
| jkji| 1|
| jdifj| 1|
| take| 1|
| she| 1|
| she| 1|
| word| 1|
| hello| 1|
| hive| 1|
| mysql| 1|
| hbase| 1|
| sqoop| 1|
| pig| 1|
| dag| 1|
+--------+---+
only showing top 20 rows


scala>
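
The default tuple column names _1/_2 can be replaced either when the DataFrame is built (toDF("key","value")) or at query time with aliases; a sketch of both (the $ syntax needs import spark.implicits._):

import spark.implicits._

// Rename at query time with column aliases
lines.select($"_1".as("key"), $"_2".as("value")).show()

// Or rename once when building the DataFrame
val named = lines.toDF("key", "value")
named.select("key", "value").show()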

 

 

 

 
