Table of Contents
File and Directory Operations
Viewing Information
File List, File Status, File Block Locations, Datanode Information
File Compression and Decompression
Serialization
Sequence File
MapFile
First of all, Hadoop must be running. On Windows, go to the sbin directory under the Hadoop installation path and run start-all.cmd; four command-line windows will pop up. Don't worry about them, just minimize them.
If Hadoop is not started, the project cannot run and will throw errors. Also, don't run the project immediately after starting Hadoop: HDFS will still be in safe mode and nothing will work properly; wait a moment and it will be fine.
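If you would rather have the client wait for safe mode to end instead of retrying by hand, a minimal sketch along these lines should work. It relies on the DistributedFileSystem and HdfsConstants classes from the Hadoop 2.x API; the SafeModeUtil class and its waitForSafeModeExit method are just illustrative names, not part of the original project.
package hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import java.net.URI;
//Hypothetical helper: poll the NameNode until it leaves safe mode
public class SafeModeUtil {
    public static void waitForSafeModeExit(String hdfsPath, Configuration conf) throws Exception {
        DistributedFileSystem dfs = (DistributedFileSystem) FileSystem.get(URI.create(hdfsPath), conf);
        //SAFEMODE_GET only queries the current state, it does not change it
        while (dfs.setSafeMode(HdfsConstants.SafeModeAction.SAFEMODE_GET)) {
            System.out.println("HDFS is still in safe mode, waiting...");
            Thread.sleep(3000);
        }
        System.out.println("HDFS has left safe mode.");
    }
}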
The HDFS API is really just a way to manage the file resources on the Hadoop server from code; most of the same operations can be done directly on the command line without writing any code, and sometimes that is more convenient.
Official hadoop fs shell command reference: http://hadoop.apache.org/docs/r2.6.0/hadoop-project-dist/hadoop-common/FileSystemShell.html
Common HDFS operations include: uploading and downloading files, creating and appending to files, reading file contents, renaming, creating directories, checking whether a file or directory exists, and deleting files or directories.
My project structure:
On to the code.
File and Directory Operations
package hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.IOUtils;
import java.io.*;
import java.net.URI;
//Operations on files/directories in the cluster
public class FileOperation {
//Get the Hadoop configuration
public Configuration getConf() {
Configuration conf=new Configuration();
return conf;
}
//Get a FileSystem object
public FileSystem getHDFS(String path,Configuration conf) throws IOException {
FileSystem hdfs=FileSystem.get(URI.create(path),conf);
return hdfs;
}
//1. Upload a file to the cluster
public String Upload(String src,String desHDFS,FileSystem hdfs){
try{
hdfs.copyFromLocalFile(new Path(src),new Path(desHDFS));
return "上传成功!";
}catch (Exception e){
e.printStackTrace();
return "上传失败!";
}
}
//2. Download a file from the cluster to the local file system
public String Download(String srcHDFS,String des,FileSystem hdfs){
try{
hdfs.copyToLocalFile(new Path(srcHDFS),new Path(des));
return "下载成功!";
}catch (Exception e){
e.printStackTrace();
return "下载失败!";
}
}
//3. Create a file
public String CreateFile(byte[] text,String des,FileSystem hdfs) {
FSDataOutputStream out=null;
try{
out=hdfs.create(new Path(des));
out.write(text,0,text.length);
return "创建文件成功!";
}catch (Exception e){
e.printStackTrace();
return "创建文件失败!";
}
finally {
IOUtils.closeStream(out);
}
}
//4. Append to a file (the content of inpath is appended to the HDFS file src)
// public String Append(String src,String inpath,FileSystem hdfs,Configuration conf){
// InputStream in=null;
// OutputStream out=null;
// try{
// conf.set("dfs.support.append", "true");
// conf.set("dfs.client.block.write.replace-datanode-on-failure.policy", "NEVER");
// conf.set("dfs.client.block.write.replace-datanode-on-failure.enable", "true");
//
// in = new BufferedInputStream(hdfs.open(new Path(inpath)));
// out = hdfs.append(new Path(src));
// IOUtils.copyBytes(in, out, 4096, true);
// return "追加文件成功!";
// }catch (Exception e){
// e.printStackTrace();
// return "追加文件失败!";
// }
// finally {
// IOUtils.closeStream(in);
// IOUtils.closeStream(out);
// }
// }
//Modify file contents (delete the old file and recreate it with the new content)
public void ModifyFile(byte[] text,String path,FileSystem hdfs) throws IOException {
//Check whether the file exists
boolean result=hdfs.exists(new Path(path));
if(result)
{
//Delete the old file (non-recursive delete is enough for a single file)
hdfs.delete(new Path(path),false);
//Recreate the file with the new content
FSDataOutputStream out=hdfs.create(new Path(path));
out.write(text,0,text.length);
IOUtils.closeStream(out);
}
}
//5. Read a file into a byte array
public byte[] ReadBytes(String file,FileSystem hdfs) throws IOException {
InputStream in=hdfs.open(new Path(file));
byte[] data=ToByteArray(in);
//close the HDFS input stream once the bytes have been copied out
IOUtils.closeStream(in);
return data;
}
private byte[] ToByteArray(InputStream in) throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
byte[] buffer = new byte[1024 * 4];//4K
int n = 0;
while ((n = in.read(buffer)) != -1) {
out.write(buffer, 0, n);
}
return out.toByteArray();
}
//Read the file and print it to stdout (Chinese characters may come out garbled)
public void ReadFile(String file,FileSystem hdfs,Configuration conf) throws IOException {
InputStream in=hdfs.open(new Path(file));
// BufferedReader bf=new BufferedReader(new InputStreamReader(in));
// String line = null;
// while ((line = bf.readLine()) != null) {
// System.out.println(line);
// }
IOUtils.copyBytes(in,System.out,conf,true);
}
//6. Rename a file
public String Rename(String src,String des,FileSystem hdfs){
try{
boolean result=hdfs.rename(new Path(src),new Path(des));
if(result)
return "重命名文件成功!";
else
return "重命名文件失败!";
}catch (Exception e){
e.printStackTrace();
return "ERROR";
}
}
//7. Create a directory
public String CreateDir(String dirpath,FileSystem hdfs){
try{
boolean result=hdfs.mkdirs(new Path(dirpath));
if(result)
return "创建目录成功!";
else
return "创建目录失败!";
}catch (Exception e){
e.printStackTrace();
return "ERROR";
}
}
//8. Check whether a file/directory exists
public String CheckExist(String path,FileSystem hdfs){
try{
boolean result=hdfs.exists(new Path(path));
if(result)
return "文件/目录已存在!";
else
return "文件/目录不存在!";
}catch (Exception e){
e.printStackTrace();
return "ERROR";
}
}
//9. Delete a file/directory
public String Delete(String path,FileSystem hdfs){
try{
boolean result=hdfs.delete(new Path(path),true); //recursive, so non-empty directories can also be removed
if(result)
return "删除成功!";
else
return "删除失败!";
}catch (Exception e){
e.printStackTrace();
return "ERROR";
}
}
public static void main(String[] args) throws IOException {
FileOperation obj=new FileOperation();
String hdfsRootPath="hdfs://localhost:9000/";
String hdfsData="hdfs://localhost:9000/data";
Configuration conf=obj.getConf();
FileSystem hdfs=obj.getHDFS(hdfsRootPath,conf);
// byte[] text1= obj.ReadBytes(hdfsData+"/testfile.txt",hdfs);
// byte[] text2=obj.ReadBytes(hdfsData+"/testfile1.txt",hdfs);
// //Merge the two byte arrays
// byte[] text=new byte[text1.length+text2.length];
// //System.arraycopy(source array, source start offset, destination array, destination start offset, length to copy)
// System.arraycopy(text1,0,text,0,text1.length);
// System.arraycopy(text2,0,text,text1.length,text2.length);
// obj.ModifyFile(text,hdfsData+"/abc.txt",hdfs);
obj.Delete(hdfsData+"/mapfile1",hdfs);
// obj.Upload("C:/hadoop/data/FileOperation/bigFile.txt",hdfsData+"/bigFile.txt",hdfs);
//
// obj.ReadFile(hdfsData+"/bigFile.txt",hdfs,conf);
}
}
Because the files live in the file system on the server, the changes are not directly visible, so after almost every step above you will want to run "hadoop fs -lsr /" at the command prompt to check the result.
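The same check can also be done from code. Here is a minimal sketch using FileSystem.listFiles, a standard Hadoop 2.x call; the ListAll class name is just for illustration.
package hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import java.net.URI;
//Rough programmatic equivalent of "hadoop fs -lsr /": recursively list every file under the root
public class ListAll {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(URI.create("hdfs://localhost:9000/"), conf);
        RemoteIterator<LocatedFileStatus> files = hdfs.listFiles(new Path("/"), true);
        while (files.hasNext()) {
            LocatedFileStatus status = files.next();
            System.out.println(status.getPath() + "  " + status.getLen() + " bytes");
        }
    }
}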
Viewing Information
File List, File Status, File Block Locations, Datanode Information
package hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
//View information: file list, file status, file block locations, datanode information
public class ViewInfos {
public static void main(String[] args) throws IOException, URISyntaxException {
//Load the Hadoop configuration
Configuration conf=new Configuration();
String hdfspath="hdfs://localhost:9000/";
String hdfsData="hdfs://localhost:9000/data";
//Create a FileSystem object
FileSystem hdfs=FileSystem.get(URI.create(hdfspath),conf);
/*--------------------1. List the files in the cluster-------------------*/
// //Call the FileSystem API
// FileStatus[] files=hdfs.listStatus(new Path(hdfsData));
// System.out.println("Files under the hdfs data directory:");
// //Print the files under the hdfs /data directory
// for(FileStatus file:files)
// System.out.println(file.getPath());
/*------------------2. View the status of files/directories in the cluster-----------------*/
// //Call the FileSystem API
// FileStatus[] files=hdfs.listStatus(new Path(hdfsData));
// for(FileStatus file:files)
// System.out.println(file.getPath() + " " +file.getModificationTime());
/*--------------------3. View file block locations in the cluster-------------------*/
// String filePath=hdfsData+"/testfile1.txt";
// FileStatus fileStatus=hdfs.getFileStatus(new Path(filePath));
// //Get the file's blocks
// BlockLocation[] blockLocations=hdfs.getFileBlockLocations(fileStatus,0,fileStatus.getLen());
// //Show each block and which hosts in the cluster hold it
// for(BlockLocation block : blockLocations){
// String[] hosts=block.getHosts();
// for(String host : hosts)
// System.out.println("block:" + block + "; host:" + host);
// }
/*------------------------4. View datanode information-----------------------*/
//Cast to DistributedFileSystem to access datanode statistics
DistributedFileSystem distributedHDFS= (DistributedFileSystem) FileSystem.get(new URI(hdfspath),conf);
DatanodeInfo[] datanodeInfos=distributedHDFS.getDataNodeStats();
for(DatanodeInfo datanode : datanodeInfos){
System.out.println("host:"+datanode.getHostName());
System.out.println("blockPoolUsed:"+datanode.getBlockPoolUsed());
}
}
}
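Related to the datanode statistics above, the overall capacity and usage of the file system can also be queried through FileSystem.getStatus, which is part of the standard API. A minimal sketch (the ClusterUsage class name is illustrative):
package hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FsStatus;
import java.net.URI;
//Print the overall capacity, used space and remaining space of the file system
public class ClusterUsage {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(URI.create("hdfs://localhost:9000/"), conf);
        FsStatus status = hdfs.getStatus();
        System.out.println("capacity : " + status.getCapacity() + " bytes");
        System.out.println("used     : " + status.getUsed() + " bytes");
        System.out.println("remaining: " + status.getRemaining() + " bytes");
    }
}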
File Compression and Decompression
Common compression formats in Hadoop include DEFLATE, gzip, bzip2, LZO, LZ4, and Snappy.
The code below uses the gzip format for compression and decompression.
package hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
//File compression and decompression
public class Compress {
//Compress with the gzip codec
public boolean CompressGZ(String src,String des,FileSystem hdfs,Configuration conf){
FSDataInputStream in=null;
FSDataOutputStream fsOut=null;
CompressionOutputStream out=null;
try {
Class <?> codecClass = Class.forName("org.apache.hadoop.io.compress.GzipCodec");
CompressionCodec codec= (CompressionCodec) ReflectionUtils.newInstance(codecClass,conf);
//Open the source file
in=hdfs.open(new Path(src));
//Create the output file
fsOut=hdfs.create(new Path(des));
out = codec.createOutputStream(fsOut);
IOUtils.copyBytes(in, out, conf);
return true;
}catch (Exception e){
e.printStackTrace();
return false;
}finally {
IOUtils.closeStream(in);
IOUtils.closeStream(fsOut);
IOUtils.closeStream(out);
}
}
//Decompress a gzip file and print it to the console
public boolean UnCompressGZ(String src,FileSystem hdfs,Configuration conf){
FSDataInputStream in=null;
InputStream inputStream=null;
try {
Class <?> codecClass = Class.forName("org.apache.hadoop.io.compress.GzipCodec");
CompressionCodec codec= (CompressionCodec) ReflectionUtils.newInstance(codecClass,conf);
//Open the compressed file
in=hdfs.open(new Path(src));
inputStream=codec.createInputStream(in);
IOUtils.copyBytes(inputStream, System.out, conf);
return true;
}catch (Exception e){
e.printStackTrace();
return false;
}finally {
IOUtils.closeStream(inputStream);
IOUtils.closeStream(in);
}
}
public static void main(String[] args) throws ClassNotFoundException, IOException {
Configuration conf=new Configuration();
String hdfspath="hdfs://localhost:9000/";
String hdfsData="hdfs://localhost:9000/data";
FileSystem hdfs=FileSystem.get(URI.create(hdfspath),conf);
//Source path and destination path for the compressed file
String src=hdfsData+"/bigFile.txt";
String des=hdfsData+"/bigFile.txt.gz";
Compress obj=new Compress();
// obj.CompressGZ(src,des,hdfs,conf);
obj.UnCompressGZ(des,hdfs,conf);
}
}
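Instead of hard-coding the GzipCodec class name, the codec can also be inferred from the file extension with CompressionCodecFactory, which is part of the standard Hadoop API. A minimal sketch (the UnCompressAny class name and the path are just for illustration):
package hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import java.io.InputStream;
import java.net.URI;
//Pick the codec from the file name extension (.gz, .bz2, ...) and print the decompressed content
public class UnCompressAny {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(URI.create("hdfs://localhost:9000/"), conf);
        Path src = new Path("hdfs://localhost:9000/data/bigFile.txt.gz");
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(src); //null if no codec matches the extension
        if (codec == null) {
            System.err.println("No codec found for " + src);
            return;
        }
        InputStream in = codec.createInputStream(hdfs.open(src));
        try {
            IOUtils.copyBytes(in, System.out, conf);
        } finally {
            IOUtils.closeStream(in);
        }
    }
}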
Serialization
Sequence File
package hdfs.serialization;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.util.ReflectionUtils;
import java.io.IOException;
import java.net.URI;
import static org.apache.hadoop.io.SequenceFile.createWriter;
//SequenceFile: addresses the small-files problem by merging many small files into one large file stored as key-value pairs
//A sequence file is splittable and its values can be compressed, so it takes up less space
public class SequenceFileDemo {
public void ReadSequenceFile(String src,FileSystem hdfs,Configuration conf) throws IOException {
SequenceFile.Reader reader= new SequenceFile.Reader(hdfs, new Path(src), conf);
Writable key= (Writable) ReflectionUtils.newInstance(reader.getKeyClass(),conf);
Writable value= (Writable) ReflectionUtils.newInstance(reader.getValueClass(),conf);
long position=reader.getPosition();
while (reader.next(key,value)){
String syneSeen=reader.syncSeen() ? "*":"";
System.out.println(position+" "+syneSeen+" "+key+" "+value);
position=reader.getPosition();
}
IOUtils.closeStream(reader);
}
public void WriteSequenceFile(String[] data,String src,FileSystem hdfs,Configuration conf) throws IOException {
IntWritable key=new IntWritable();
Text value=new Text();
SequenceFile.Writer writer= SequenceFile.createWriter(hdfs,conf,new Path(src),key.getClass(),value.getClass());
int i=0;
for(String row : data)
{
i++;
key.set(i);
value.set(row);
System.out.println(writer.getLength()+" "+key+" "+value);
writer.append(key,value);
}
IOUtils.closeStream(writer);
}
public static void main(String[] args) throws IOException {
Configuration conf=new Configuration();
String hdfspath="hdfs://localhost:9000/";
FileSystem hdfs=FileSystem.get(URI.create(hdfspath),conf);
String data[]={"One, two, Buckle my shoe","Three, four, Shut the front door","Five, six, Pick up sticks",
"Seven, eight, Lay them straight","Nine, ten, A big fat hen"};
String src=hdfspath+"data/sequence.seq";
SequenceFileDemo obj=new SequenceFileDemo();
// obj.WriteSequenceFile(data,src,hdfs,conf);
obj.ReadSequenceFile(src,hdfs,conf);
}
}
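The comment above notes that the values of a sequence file can be compressed. Below is a minimal sketch of writing a block-compressed sequence file; it is only an assumption-level illustration that reuses the same (deprecated but still available in 2.x) createWriter overload style as the demo above, and it uses DefaultCodec because, unlike GzipCodec, it does not require the native Hadoop libraries.
package hdfs.serialization;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.util.ReflectionUtils;
import java.net.URI;
//Write a block-compressed sequence file: records are buffered and compressed in blocks
public class CompressedSequenceFileDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(URI.create("hdfs://localhost:9000/"), conf);
        Path dst = new Path("hdfs://localhost:9000/data/sequence-compressed.seq");
        CompressionCodec codec = ReflectionUtils.newInstance(DefaultCodec.class, conf);
        IntWritable key = new IntWritable();
        Text value = new Text();
        SequenceFile.Writer writer = SequenceFile.createWriter(hdfs, conf, dst,
                key.getClass(), value.getClass(), SequenceFile.CompressionType.BLOCK, codec);
        try {
            for (int i = 1; i <= 5; i++) {
                key.set(i);
                value.set("record number " + i);
                writer.append(key, value);
            }
        } finally {
            IOUtils.closeStream(writer);
        }
    }
}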
MapFile
package hdfs.serialization;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.*;
import java.io.IOException;
import java.net.URI;
//Similar to and compatible with SequenceFile, but stored as a directory containing an index file and a data file (key-value pairs)
public class MapFileDemo {
public void WriteMapFile(String[] data,String src,FileSystem hdfs,Configuration conf) throws IOException {
IntWritable key=new IntWritable();
Text value=new Text();
MapFile.Writer writer= new MapFile.Writer(conf,hdfs,src,key.getClass(),value.getClass());
//Write one index entry for every 2 records
writer.setIndexInterval(2);
int i=0;
for(String row: data){
i++;
key.set(i);
value.set(row);
System.out.println(i+" "+row);
writer.append(key,value);
}
IOUtils.closeStream(writer);
}
public void ReadMapFile(String src,FileSystem hdfs,Configuration conf) throws IOException {
MapFile.Reader reader=new MapFile.Reader(hdfs,src,conf);
//next() fills key and value with the next entry and returns false at the end of the file
IntWritable key=new IntWritable();
Text value=new Text();
while (reader.next(key,value)){
System.out.println(key +" "+value);
}
IOUtils.closeStream(reader);
}
public static void main(String[] args) throws IOException {
Configuration conf=new Configuration();
String hdfspath="hdfs://localhost:9000/";
FileSystem hdfs=FileSystem.get(URI.create(hdfspath),conf);
String data[]={"java,scala,python","hadoop,hive,hbase",
"spark,kafka,strom,sqoop", "spark sql,spark steaming,MLBase,MLlib"};
String src=hdfspath+"data/mapfile";
MapFileDemo obj=new MapFileDemo();
obj.WriteMapFile(data,src,hdfs,conf);
// obj.ReadMapFile(src,hdfs,conf);
}
}
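The main advantage of a MapFile over a plain sequence file is that its index allows random access by key via MapFile.Reader.get, which positions the reader using the index and returns null when the key is absent. A minimal sketch (the MapFileLookup class name is illustrative; the path and keys follow the demo above):
package hdfs.serialization;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.*;
import java.net.URI;
//Look up a single record in a MapFile by key instead of scanning the whole file
public class MapFileLookup {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(URI.create("hdfs://localhost:9000/"), conf);
        MapFile.Reader reader = new MapFile.Reader(hdfs, "hdfs://localhost:9000/data/mapfile", conf);
        try {
            Text value = new Text();
            //get() fills value and returns it if the key exists, otherwise returns null
            Writable found = reader.get(new IntWritable(3), value);
            System.out.println(found == null ? "key 3 not found" : "key 3 -> " + value);
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}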