With the Hadoop environment set up, this post tries out the Java API of the distributed file system (HDFS) that Hadoop provides.
I put together a simple example covering the basic operations: reading, writing, and deleting files, creating directories, and listing files. The Maven dependencies and the complete Java code are included at the end.
Connecting to HDFS
All it takes to connect to HDFS is an HDFS URI. If the connection fails, check whether HDFS actually started successfully, and whether it is really listening on port 9000.
String uri = "hdfs://localhost:9000";
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(uri), conf);
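Equivalently, the address can come from the fs.defaultFS property instead of an explicit URI. A minimal variant of the same connection, assuming the same pseudo-distributed setup on localhost:
Configuration conf = new Configuration();
// same effect as passing the URI to FileSystem.get(URI, Configuration)
conf.set("fs.defaultFS", "hdfs://localhost:9000");
FileSystem fs = FileSystem.get(conf);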
Creating a directory
public boolean mkdirs(Path p) throws IOException
Directories are created with the org.apache.hadoop.fs.FileSystem.mkdirs() method; like mkdir -p, it also creates any missing parent directories along the path. Sample code:
String folder = "/user/zdk/test-fs";
Path newFolderPath = new Path(folder);
if (!fs.exists(newFolderPath)) {
    fs.mkdirs(newFolderPath);
    System.out.println("new folder created:" + folder);
}
Creating a file
public FSDataOutputStream create(Path p) throws IOException
Files are created with the org.apache.hadoop.fs.FileSystem.create() method. Sample code:
String fileName = "file1.txt";
String fileContent = "hello,hadoop!";
Path newFilePath = new Path(newFolderPath + "/" + fileName);
FSDataOutputStream output = fs.create(newFilePath);
output.writeBytes(fileContent);
output.close();
System.out.println("file content write end");
The org.apache.hadoop.fs.FileSystem.create() method returns an org.apache.hadoop.fs.FSDataOutputStream. When data is written through this object, it internally asks the namenode to create the file's metadata and then writes the data out to datanodes. During this process the data is replicated N times, where N is the dfs.replication value in etc/hadoop/hdfs-site.xml under the Hadoop installation directory. If a datanode fails while the data is being written, the write automatically falls back to another datanode. This whole intricate process is encapsulated inside org.apache.hadoop.fs.FSDataOutputStream.
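To verify which replication factor the file actually got, here is a small sketch using the FileSystem handle and the newFilePath from the create example above:
// read the replication factor recorded in the file's metadata
short replication = fs.getFileStatus(newFilePath).getReplication();
System.out.println("replication factor: " + replication);
// the factor can also be changed per file after creation
fs.setReplication(newFilePath, (short) 2);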
Listing files
public FileStatus[] listStatus(Path p) throws IOException
This method lists all the files under a directory.
public FileStatus[] globStatus(Path pathPattern) throws IOException
This method lists all the files matching a given pattern.
Sample code:
// list files
FileStatus[] fileStatusList = fs.listStatus(newFolderPath);
// list all text files
//FileStatus[] fileStatusList = fs.globStatus(new Path(folder + "/*.txt"));
int length = fileStatusList.length;
System.out.println("found " + length + " files");
for (FileStatus status : fileStatusList) {
    System.out.println(status.getPath().toString());
}
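listStatus() also has an overload that takes a PathFilter, which filters without a glob string. PathFilter has a single accept(Path) method, so under Java 8 a lambda works; a minimal sketch:
// keep only the .txt entries while listing
FileStatus[] txtFiles = fs.listStatus(newFolderPath, p -> p.getName().endsWith(".txt"));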
Reading file content
public FSDataInputStream open(Path p) throws IOException
The org.apache.hadoop.fs.FileSystem.open() method returns an org.apache.hadoop.fs.FSDataInputStream. This class first asks the namenode for all the blocks that make up the file; for each block the namenode returns the addresses of several datanodes, sorted by their distance to the client. The client then fetches the data from the nearest datanode, and if that datanode is broken it falls back to the next one. This complex process is likewise encapsulated inside the org.apache.hadoop.fs.FSDataInputStream class.
Sample code:
FSDataInputStream input = fs.open(newFilePath);
String content = IOUtils.toString(input);
input.close();
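Since FSDataInputStream implements Seekable and PositionedReadable, it can also read from an arbitrary offset, which a plain InputStream cannot. A small sketch against the file written above (StandardCharsets is from java.nio.charset):
FSDataInputStream in = fs.open(newFilePath);
byte[] buf = new byte[6];
// positioned read: grabs the 6 bytes at offset 6 ("hadoop" in "hello,hadoop!")
// without moving the stream's current position
in.readFully(6, buf);
System.out.println(new String(buf, StandardCharsets.UTF_8));
in.close();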
Deleting files and directories
public boolean delete(Path path, boolean recursive) throws IOException
If path is a non-empty directory, the recursive argument must be true, otherwise an IOException is thrown. If path is a file, recursive can be either true or false. Sample code:
// delete file
fs.delete(newFilePath, false);
// delete folder
fs.delete(newFolderPath, true);
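Note that when the path does not exist, delete() reports this through its boolean return value rather than an exception, so the result is worth checking:
// delete() returns false if the path did not exist
boolean deleted = fs.delete(newFilePath, false);
System.out.println("file deleted: " + deleted);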
Below are the complete Maven configuration file and the complete Java code.
Complete Maven configuration file
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.zdk</groupId>
<artifactId>hadoop-demo</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>hadoop-demo</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<hadoop.version>2.8.1</hadoop.version>
<log4j.version>1.2.17</log4j.version>
<commons-io.version>1.3.2</commons-io.version>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<!-- Hadoop main client artifact -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-io</artifactId>
<version>${commons-io.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>io.saagie</groupId>
<artifactId>saagie-maven-plugin</artifactId>
<version>1.0.2</version>
<configuration>
<platformId>1</platformId>
<jobName>example-java-read-and-write-from-hdfs</jobName>
<jobCategory>extract</jobCategory>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<archive>
<manifest>
<mainClass>io.saagie.example.hdfs.Main</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id> <!-- this is used for inheritance merges -->
<phase>package</phase> <!-- bind to the packaging phase -->
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
Complete Java code
package org.zdk.hadoop_demo;

import java.net.URI;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class App {
    public static void main(String[] args) throws Exception {
        String uri = "hdfs://localhost:9000";
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        // create new folder
        String folder = "/user/zdk/test-fs";
        Path newFolderPath = new Path(folder);
        if (!fs.exists(newFolderPath)) {
            fs.mkdirs(newFolderPath);
            System.out.println("new folder created:" + folder);
        }
        // create new file and write content
        String fileName = "file1.txt";
        String fileContent = "hello,hadoop!";
        Path newFilePath = new Path(newFolderPath + "/" + fileName);
        FSDataOutputStream output = fs.create(newFilePath);
        output.writeBytes(fileContent);
        output.close();
        System.out.println("file content write end");
        // list files
        FileStatus[] fileStatusList = fs.listStatus(newFolderPath);
        // list all text files
        //FileStatus[] fileStatusList = fs.globStatus(new Path(folder + "/*.txt"));
        int length = fileStatusList.length;
        System.out.println("found " + length + " files");
        for (FileStatus status : fileStatusList) {
            System.out.println(status.getPath().toString());
        }
        // read file content
        FSDataInputStream input = fs.open(newFilePath);
        String content = IOUtils.toString(input);
        System.out.println(content);
        input.close();
        // delete file
        fs.delete(newFilePath, false);
        // delete folder
        fs.delete(newFolderPath, true);
        // close fileSystem
        fs.close();
    }
}