Basic Scala Method Usage

1. The code is as follows:

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

import scala.collection.JavaConverters._

object Test {
def errorAndWarning(args: Array[String]): Unit = {
//define a method that finds the lines in the file containing "error" or "warning"

val conf = new SparkConf().setAppName("MyTest").setMaster("local")
val sc = new SparkContext(conf)

/*
1. The following code can also be run directly in the spark-shell,
2. because spark-shell already defines an sc when it starts up.
*/
val inputRDD = sc.textFile("/root/log_test.txt")
//keep the lines that contain either "error" or "warning"
val outputRDD = inputRDD.filter(line => line.contains("error") || line.contains("warning"))
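// A minimal check of the filter above (a sketch; it assumes /root/log_test.txt actually exists):
// materialize the result and print the matching lines.
outputRDD.collect.foreach(println)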


val result = sc.parallelize(Array(1, 2, 3, 4, 5))
result.persist(StorageLevel.DISK_ONLY) //persist this RDD on disk only

val lines = sc.parallelize(Array("Hello Spark", "Hello hadoop"))
val word = Array("Hello Spark", "Hello hadoop")

//split each line on spaces and keep the first word ("Hello")
val result1 = lines.map(_.split(" ")(0))
}

def main(args:Array[String]): Unit = {
val conf = new SparkConf().setAppName("test").setMaster("local")
val sc = new SparkContext(conf)

/**
 * 1. test aggregate()()
 * Because the Scala code is hard to follow at first sight, here is a step-by-step explanation:
 * Step 1: parallelize creates an RDD with two partitions: ("12", "23") and ("345", "4567").
 * Step 2: aggregate()() is applied to the RDD. In the first function, x starts as the "" from the first
 *         argument list (length 0) and y is the current element; keeping the larger length within each
 *         partition yields "2" for the first partition and "4" for the second.
 * Step 3: The second function (x, y) => x + y combines the two per-partition results by string
 *         concatenation, so the final result is the string "24".
 */
val rdd3 = sc.parallelize(List("12", "23", "345", "4567"), 2)
val result = rdd3.aggregate("")((x, y) => math.max(x.length, y.length).toString, (x, y) => x + y)
println(result)
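// A quick way to confirm the partition layout assumed in the explanation above (a sketch):
// glom() gathers each partition into an Array so its contents can be printed.
rdd3.glom().collect().foreach(part => println(part.mkString("[", ",", "]")))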

/* step 1: the initial value is 1; y.sum is the sum of each inner List
 * step 2: the second function (m, n) => m + n would combine partial results (it is only used for parallel collections)
 * step 3: on a sequential List the initial value is accumulated exactly once
 * step 4: the inner sums are 3 + 12 + 6 = 21, so the result is 1 + 21 = 22
 */
val arr = List(List(1, 2), List(3, 4, 5), List(5, 1))
val res1 = arr.aggregate(1)((x, y) => x + y.sum, (m, n) => m + n)
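// A minimal check of the claim above: on a sequential collection, aggregate(z)(seqop, combop)
// behaves like foldLeft(z)(seqop), so both lines below are expected to print 22.
println(res1)
println(arr.foldLeft(1)((acc, inner) => acc + inner.sum))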

/**
 * 2. fun2 is a user-defined method; its parameters are an Int partition index and an Iterator of Strings.
 */
def fun2(index: Int, iter: Iterator[String]): Iterator[String] = {
iter.toList.map(x => "[partID:" + index + ",val:" + x + "]").iterator //convert to a List, then back to an Iterator
}

//fun3 is the same idea for a pair RDD: it tags each (String, Int) element with its partition index
def fun3(index: Int, iter: Iterator[(String, Int)]): Iterator[String] = {
iter.toList.map(x => "[partID:" + index + ",val:" + x + "]").iterator
}

/**
 * 1. mapPartitionsWithIndex calls fun2 once per partition, passing the partition index and that partition's iterator.
 */
rdd3.mapPartitionsWithIndex(fun2).collect.foreach(println) //print each element tagged with its partition ID


/**
 * 3.
 * 01 Note: with reduce, the operand type and the return type must be the same.
 * [My understanding: list is a List of String, and each reduce step must again produce a String,
 *  which is why toString is needed below.]
 */
val list = List("345", "1", "23")
//find the smaller of the two lengths at each step
val rddList: String = list.reduce((x, y) => math.min(x.length, y.length).toString)
println("rddList in Example3 is : " + rddList)

/**
 * 4. Notice the following two statements:
 * val str1 = "".length      // the result is 0
 * val str2 = str1.toString  // "0"
 */
val rdd4 = sc.parallelize(List("12", "23", "345", "4567"), 2)
val str: String = rdd4.aggregate("")((x, y) => math.min(x.length, y.length).toString, (x, y) => x + y)
//the result is surprising: each partition starts from "" (length 0), so the first min is 0 and the
//second is 1, giving "1" per partition and "11" after concatenation
print("str in Example4 is : " + str)

val rdd1 = sc.parallelize(1 to 10 ,2)//get 2 partitions
rdd1.partitions.length

val rdd2 = rdd1.repartition(3) //repartition shuffles the data into 3 partitions
rdd2.partitions.length //now 3

val rdd5 = rdd1.coalesce(3) //without shuffle, coalesce can only decrease the number of partitions, so it stays at 2
rdd5.partitions.length
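// A sketch of the counterpart: passing shuffle = true lets coalesce increase the partition count as well.
val rdd6 = rdd1.coalesce(3, shuffle = true)
println(rdd6.partitions.length) //expected to be 3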

val pairRDD = sc.parallelize(Array(("cat", 2), ("cat", 5), ("mouse", 4), ("dog", 6)), 2)
val resultPair: RDD[(String, Int)] = pairRDD.aggregateByKey(0)(_ + _, _ + _)
resultPair.collect.foreach(println)
//aggregateByKey() aggregates the values of each key: within each partition first, then across partitions
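// Optional checks (a sketch): fun3 from above shows how the pairs are spread across the two partitions,
// and with (_ + _, _ + _) the aggregateByKey(0) call gives the same result as reduceByKey(_ + _).
pairRDD.mapPartitionsWithIndex(fun3).collect.foreach(println)
pairRDD.reduceByKey(_ + _).collect.foreach(println)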
}

def chapter8(): Unit = {
val conf = new SparkConf()
conf.set("spark.app.name","My Spark App")
conf.set("spark.master","local")
//setMaster() and setAppName() both call the set() method under the hood.
//conf.set("spark.ui.port","36000") //override the default Spark UI port
val sc = new SparkContext(conf)
}
}