I am using the spark-2.3.0-bin-hadoop2.7.tar release.

Below are the setup steps. Before we can use Spark, we obviously have to install it.

Step 1:

In your virtual machine, run pip install pyspark. If the command finishes without errors, the PySpark package is installed; a quick way to verify it is shown just below.




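To double-check that the package is importable, you can ask Python for its version (just a quick sanity check; the pyspark package exposes a __version__ attribute):

python -c "import pyspark; print(pyspark.__version__)"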

Step 2:

If you have the XFT file-transfer plugin installed, use it to copy the Spark tarball (spark-2.3.0-bin-hadoop2.7.tar) onto the virtual machine; a command-line alternative is sketched below.



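If you do not have that plugin, a plain scp from your host machine does the same job; the user name, VM address and destination below are placeholders, so substitute your own:

scp spark-2.3.0-bin-hadoop2.7.tar user@your-vm-ip:~/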

Step 3:

Extract the archive (here into /opt; adjust the target directory to your own layout): tar -xvf spark-2.3.0-bin-hadoop2.7.tar -C /opt


Step 4:

Now configure the environment variables. In the virtual machine, from your home directory (~), open the file with vi ~/.bashrc, add the Spark entries, save and exit, then reload it with source ~/.bashrc. A minimal sketch of the entries is given below.

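As a minimal sketch, assuming the archive was extracted to /opt/spark-2.3.0-bin-hadoop2.7 and that Hadoop (if you want Spark to talk to it) lives under /opt/hadoop, the ~/.bashrc entries look roughly like this; both paths are assumptions, so adjust them to your machine:

export SPARK_HOME=/opt/spark-2.3.0-bin-hadoop2.7
export PATH=$PATH:$SPARK_HOME/bin
export PYSPARK_PYTHON=python3
# Only needed if Spark should pick up your Hadoop cluster configuration
export HADOOP_CONF_DIR=/opt/hadoop/etc/hadoop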

Step 5:

Once all of that is done, we can start writing PySpark code.

1. # Import the module
import pyspark
# Import the classes
from pyspark import SparkContext,SparkConf
# Create the configuration: set the AppName and the Master (host)
conf = SparkConf().setAppName('demoRDD').setMaster('local[*]')


# Create the session
sc = SparkContext.getOrCreate(conf)
# Operate on Spark through the session
# Provide test data as a Python list
data = [x for x in range(11)]
rdd = sc.parallelize(data)
print('RDD object:',rdd,'\nrecord count:',rdd.count())
partiRdd = rdd.glom()
print('RDD.glom() collect result:',partiRdd.collect())
print(partiRdd.count())
print('RDD collect result:',rdd.collect())
# Close the session
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
2. # The same idea, written a different way
# map operation
# filter operation
# Create the session
sc = SparkContext.getOrCreate(conf)
# Operate on Spark through the session
# Provide test data as a Python list
data = [x for x in range(11)]
rdd = sc.parallelize(data)
print('RDD record count:',rdd.count())
print('RDD collect result:\n',rdd.collect())


# Check whether a number is even
def filterOdd(x):
    return x%2 == 0


# map operation, then a filter operation
listA = rdd.map(lambda x:x**3).filter(filterOdd).collect()
print(listA)
print(rdd.collect())
# Close the session
sc.stop()

----------------------------------------------------------------------------------------------------------------------------------
3. # Use a text file as the data source
sc = SparkContext.getOrCreate(conf)
rows = sc.textFile("file:///Users/chuzhengkai/Desktop/test.txt") # file path on the virtual machine
print(rows.first())
print(rows.take(2))
print(rows.count())
print(rows.top(2))
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
# Use multiple text files
# to do a word-frequency count
sc = SparkContext.getOrCreate(conf)
# Read several text files into a single RDD
filesRDD = sc.wholeTextFiles('file:///Users/chuzhengkai/Desktop/*.txt')
# RDD holding just the file contents
fileConRDD = filesRDD.map(lambda x:x[1])


# Split a string on newlines to get a list of lines
def sp(x):
    return x.split('\n')


# Map over each file's contents; the result is one list per file,
# i.e. a two-dimensional structure
strRDD = fileConRDD.map(sp)
# The same mapping, but the result is flattened into a one-dimensional structure
wordRDD = fileConRDD.flatMap(sp)
# (filesRDD itself is a list of tuples, one (filename, contents) tuple per file)
# Word-count map step
wordDictRDD = wordRDD.map(lambda x:(x,1))


# Reduce step
r = wordDictRDD.reduceByKey(lambda x,y:x+y)


#print(strRDD.collect())
#print(wordRDD.collect())
#print(wordDictRDD.collect())
print(r.collect())
sc.stop()
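Note that sp splits on newlines, so what actually gets counted above are whole lines rather than words. If you want per-word frequencies, a small variation is sketched here; it reuses filesRDD from the block above (so run it before the sc.stop() call) and splits on arbitrary whitespace:

# Sketch: count individual words instead of whole lines
wordRDD2 = filesRDD.flatMap(lambda x: x[1].split())
counts = wordRDD2.map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)
print(counts.collect())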
----------------------------------------------------------------------------------------------------------------------------------
from pyspark import SparkContext,SparkConf


conf=SparkConf().setAppName('demoPrj').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)
# Initialize from a Python list
data =[1,2,3,4,5,6,7,8,9]
rdd = sc.parallelize(data)
print(rdd.collect())
print(rdd.getNumPartitions())
print(rdd.glom().collect())
print(rdd.first())
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
conf=SparkConf().setAppName('demoPrj').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)


# Read a single text file
rdd = sc.textFile("file:///Users/chuzhengkai/Desktop/test.txt")
print(rdd.collect())
print(rdd.getNumPartitions())
print(rdd.glom().collect())
# Run a map operation
rdda=rdd.map(lambda x:len(x))
print(rdda.collect())
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
conf=SparkConf().setAppName('demoPrj').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)


# Read all matching text files
rdd = sc.wholeTextFiles("file:///Users/chuzhengkai/Desktop/*.txt")
a=rdd.collect()
for b in a:
    # b is a (filename, contents) tuple
    print(b[0])
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
# Transformations
conf=SparkConf().setAppName('demoPrj').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)
# Initialize from a Python list
data =[x for x in range(10)]
rdd = sc.parallelize(data)
print(rdd.collect())


def pf(x):
    return x**2


def ou(x):
    return x%2==0
# map operation
xrdd = rdd.map(pf)
print(xrdd.collect())
# filter operation
xrdd = rdd.map(pf).filter(ou)
print(xrdd.collect())
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
conf=SparkConf().setAppName('demoPrj').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)
# Initialize from a Python list
data =['Hello guys','My name is David','Welcome to USA']
rdd = sc.parallelize(data)
print(rdd.collect())


def fg(x):
    return x.split(' ')


# map operation
xrdd = rdd.map(fg)
print(xrdd.collect())


# flatMap operation
xrdd = rdd.flatMap(fg)
print(xrdd.collect())
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
# Word-frequency count
conf=SparkConf().setAppName('demoPrj').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)


# Read a single text file
rdd = sc.textFile("file:///Users/chuzhengkai/Desktop/test0.txt")
print(rdd.collect())


def wmap(x):
    return (x,1)


def wreduce(x,y):
    return x+y


# Run the map step
rddm=rdd.map(wmap)
print(rddm.collect())
# Run the reduce step
rddr=rddm.reduceByKey(wreduce)
print(rddr.collect())
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
# Remove duplicates
conf=SparkConf().setAppName('demoPrj').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)


# Read a single text file
rdd = sc.textFile("file:///Users/chuzhengkai/Desktop/test0.txt")
print(rdd.collect())
print(rdd.distinct().collect())
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
# Sorting
conf=SparkConf().setAppName('demoPrj').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)


# Read a single text file
rdd = sc.textFile("file:///Users/chuzhengkai/Desktop/test0.txt")
print(rdd.collect())


def wmap(x):
    return (x,1)


def wreduce(x,y):
    return x+y


def px(x):
    return x[1]


# Run the map step
rddm=rdd.map(wmap)
print(rddm.collect())


# Run the reduce step
rddr=rddm.reduceByKey(wreduce)
print(rddr.collect())


# Sort the results by frequency
rdds=rddr.sortBy(px,ascending=False)
print(rdds.collectAsMap())


# Take the top three
print(rdds.take(3))


# Sort by key
print(rddr.sortByKey().collect())


# Group by key
print(rddm.groupByKey().map(lambda x:{x[0]:[y for y in x[1]]}).collect())
sc.stop()