I am using the spark-2.3.0-bin-hadoop2.7.tar release.
Below are the setup steps. To use Spark, we first have to install it.
Step 1:
On your virtual machine, run pip install pyspark. If the install completes successfully, PySpark is ready to use.
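As a quick check that the install worked (a minimal sketch, not part of the original steps), you can ask the package for its version from the shell:
python -c "import pyspark; print(pyspark.__version__)"
If this prints a version number, the pip install succeeded.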
Step 2:
If you have the XFT file-transfer plugin installed, use it to upload the Spark archive (spark-2.3.0-bin-hadoop2.7.tar) to the virtual machine.
Step 3:
Extract the archive, here into /opt: tar -xvf spark-2.3.0-bin-hadoop2.7.tar -C /opt
Step 4:
Configure the environment variables. On the virtual machine, from your home directory (~), run vi ~/.bashrc, add the Spark entries, save and quit, then reload the file with source ~/.bashrc.
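The post does not list the exact entries, so here is a minimal sketch of what ~/.bashrc typically needs, assuming the archive was extracted to /opt/spark-2.3.0-bin-hadoop2.7 (adjust the path to your own layout):
# assumed install location -- change if you extracted elsewhere
export SPARK_HOME=/opt/spark-2.3.0-bin-hadoop2.7
export PATH=$SPARK_HOME/bin:$PATH
# optional: tell PySpark which Python interpreter to use
export PYSPARK_PYTHON=python3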
Step 5:
With the setup done, we can start writing code.
1. # Import the module
import pyspark
# Import the classes
from pyspark import SparkContext,SparkConf
# Create the configuration: set the AppName and the Master (host)
conf = SparkConf(
).setAppName('demoRDD'
).setMaster('local[*]')
# Create the session
sc = SparkContext.getOrCreate(conf)
# Operate on Spark through the session
# Provide test data as a Python list
data = [x for x in range(11)]
rdd = sc.parallelize(data)
print('RDD object:',rdd,'\nRecord count:',rdd.count())
partiRdd = rdd.glom()
print('RDD.glom() collect result:',partiRdd.collect())
print(partiRdd.count())
print('RDD collect result:',rdd.collect())
# Close the session
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
2. # A different way to write it
# map operation
# filter operation
# Create the session
sc = SparkContext.getOrCreate(conf)
# Operate on Spark through the session
# Provide test data as a Python list
data = [x for x in range(11)]
rdd = sc.parallelize(data)
print('RDD record count:',rdd.count())
print('RDD collect result:\n',rdd.collect())
# Check whether a number is even
def filterOdd(x):
    return x%2 == 0
# map operation: apply a mapping
# filter operation: keep only matching elements
listA = rdd.map(lambda x:x**3
).filter(filterOdd
).collect()
print(listA)
print(rdd.collect())
# Close the session
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
3. # Use a text file as the data source
sc = SparkContext.getOrCreate(conf)
rows = sc.textFile("file:///Users/chuzhengkai/Desktop/test.txt")# path of the file on the virtual machine
print(rows.first())
print(rows.take(2))
print(rows.count())
print(rows.top(2))
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
# Use multiple text files
# and do a word-frequency count
sc = SparkContext.getOrCreate(conf)
# Load multiple text files into a single RDD
filesRDD = sc.wholeTextFiles('file:///Users/chuzhengkai/Desktop/*.txt')
# RDD of the file contents
fileConRDD = filesRDD.map(lambda x:x[1])
# Split a string on newline characters into a list
def sp(x):
    return x.split('\n')
# Map over each file's contents; the result is one list per file,
# i.e. a two-dimensional structure
strRDD = fileConRDD.map(sp)
# The same mapping, but flattened into a one-dimensional structure
wordRDD = fileConRDD.flatMap(sp)
# (wholeTextFiles itself yields one tuple per file, so filesRDD is a list of tuples)
# Word-count map step
wordDictRDD = wordRDD.map(lambda x:(x,1))
# Reduce step
r = wordDictRDD.reduceByKey(lambda x,y:x+y)
#print(strRDD.collect())
#print(wordRDD.collect())
#print(wordDictRDD.collect())
print(r.collect())
sc.stop()
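One detail worth noting: split('\n') produces one element per line, so the counts above are really line frequencies. To count individual words (a small variation on the example above, assuming whitespace-separated text), the lines can additionally be split on spaces before the map step, run inside the same session:
# hypothetical variation: flatten files into lines, then lines into words
wordRDD = fileConRDD.flatMap(sp).flatMap(lambda line: line.split(' '))
wordDictRDD = wordRDD.map(lambda x:(x,1))
r = wordDictRDD.reduceByKey(lambda x,y:x+y)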
----------------------------------------------------------------------------------------------------------------------------------
from pyspark import SparkContext,SparkConf
conf=SparkConf().setAppName('demoPrj').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)
# Initialize from a Python list
data =[1,2,3,4,5,6,7,8,9]
rdd = sc.parallelize(data)
print(rdd.collect())
print(rdd.getNumPartitions())
print(rdd.glom().collect())
print(rdd.first())
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
conf=SparkConf().setAppName('demoPrj').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)
# Read a single text file
rdd = sc.textFile("file:///Users/chuzhengkai/Desktop/test.txt")
print(rdd.collect())
print(rdd.getNumPartitions())
print(rdd.glom().collect())
# Run a map operation
rdda=rdd.map(lambda x:len(x))
print(rdda.collect())
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
conf=SparkConf().setAppName('demoPrj').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)
# Read all the text files
rdd = sc.wholeTextFiles("file:///Users/chuzhengkai/Desktop/*.txt")
a=rdd.collect()
for b in a:
    # b is a tuple of (filename, contents)
    print(b[0])
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
# Transformations
conf=SparkConf().setAppName('demoPrj').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)
# Initialize from a Python list
data =[x for x in range(10)]
rdd = sc.parallelize(data)
print(rdd.collect())
# Square a number
def pf(x):
    return x**2
# Check whether a number is even
def ou(x):
    return x%2==0
# map operation
xrdd = rdd.map(pf)
print(xrdd.collect())
# filter operation
xrdd = rdd.map(pf).filter(ou)
print(xrdd.collect())
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
conf=SparkConf().setAppName('demoPrj').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)
# Initialize from a Python list
data =['Hello guys','My name is David','Welcome to USA']
rdd = sc.parallelize(data)
print(rdd.collect())
# Split a string on spaces
def fg(x):
    return x.split(' ')
# map operation
xrdd = rdd.map(fg)
print(xrdd.collect())
# flatMap operation
xrdd = rdd.flatMap(fg)
print(xrdd.collect())
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
# Word-frequency count
conf=SparkConf().setAppName('demoPrj').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)
# Read a single text file
rdd = sc.textFile("file:///Users/chuzhengkai/Desktop/test0.txt")
print(rdd.collect())
# Map each record to a (key, 1) pair
def wmap(x):
    return (x,1)
# Add up the counts for a key
def wreduce(x,y):
    return x+y
# Run the map step
rddm=rdd.map(wmap)
print(rddm.collect())
# Run the reduce step
rddr=rddm.reduceByKey(wreduce)
print(rddr.collect())
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
# Remove duplicates
conf=SparkConf().setAppName('demoPrj').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)
# Read a single text file
rdd = sc.textFile("file:///Users/chuzhengkai/Desktop/test0.txt")
print(rdd.collect())
print(rdd.distinct().collect())
sc.stop()
----------------------------------------------------------------------------------------------------------------------------------
# Sorting
conf=SparkConf().setAppName('demoPrj').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)
# Read a single text file
rdd = sc.textFile("file:///Users/chuzhengkai/Desktop/test0.txt")
print(rdd.collect())
# Map each record to a (key, 1) pair
def wmap(x):
    return (x,1)
# Add up the counts for a key
def wreduce(x,y):
    return x+y
# Sort key: the count is the second element of each (word, count) tuple
def px(x):
    return x[1]
# Run the map step
rddm=rdd.map(wmap)
print(rddm.collect())
# Run the reduce step
rddr=rddm.reduceByKey(wreduce)
print(rddr.collect())
# Sort the result by frequency
rdds=rddr.sortBy(px,ascending=False)
print(rdds.collectAsMap())
# Take the top three
print(rdds.take(3))
# Sort by key
print(rddr.sortByKey().collect())
# Group by key
print(rddm.groupByKey().map(lambda x:{x[0]:[y for y in x[1]]}).collect())
sc.stop()