After setting up the Hadoop environment, this post walks through the Java API of Hadoop's distributed file system (HDFS).
I put together a simple example that covers the basic operations: reading and writing files, deleting files, creating folders, and listing files. The complete Maven configuration and Java code are included at the end.

Connecting to HDFS

You only need an HDFS URI to connect. If the connection fails, check whether HDFS started successfully and whether it is really listening on port 9000 (the port configured in fs.defaultFS).

String uri = "hdfs://localhost:9000";
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(uri), conf);
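
If core-site.xml is on the classpath (an assumption, since the classpath is not shown in this post), you can also let Configuration supply the address via fs.defaultFS instead of hardcoding the URI. A minimal sketch:

Configuration conf = new Configuration();
// fs.defaultFS is read from core-site.xml, e.g. hdfs://localhost:9000
System.out.println("connecting to " + conf.get("fs.defaultFS"));
FileSystem fs = FileSystem.get(conf);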

Creating a folder

public boolean mkdirs(Path p) throws IOException

A folder is created with the org.apache.hadoop.fs.FileSystem.mkdirs() method. Here is the sample code:

String folder = "/user/zdk/test-fs";
Path newFolderPath = new Path(folder);
if (!fs.exists(newFolderPath)) {
    fs.mkdirs(newFolderPath);
    System.out.println("new folder created:" + folder);
}

Creating a file

public FSDataOutputStream create(Path p) throws IOException

A file is created with the org.apache.hadoop.fs.FileSystem.create() method. Here is the sample code:

String fileName = "file1.txt";
String fileContent = "hello,hadoop!";
Path newFilePath = new Path(newFolderPath + "/" + fileName);
FSDataOutputStream output = fs.create(newFilePath);
output.writeBytes(fileContent);
output.close();
System.out.println("file content write end");
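
Note that FSDataOutputStream.writeBytes() writes only the low byte of each character, which is fine for the ASCII content above but will mangle non-ASCII text. A safer variant (a small sketch, not part of the original example) writes UTF-8 bytes explicitly:

// requires: import java.nio.charset.StandardCharsets;
FSDataOutputStream out = fs.create(newFilePath);
out.write(fileContent.getBytes(StandardCharsets.UTF_8));
out.close();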

The org.apache.hadoop.fs.FileSystem.create() method returns an org.apache.hadoop.fs.FSDataOutputStream object. When data is written through this object, it first asks the namenode to create the file's metadata and then writes the data to the datanodes. During this process the data is replicated N times, where N is the dfs.replication value in /etc/hadoop/hdfs-site.xml. If a datanode fails while the data is being written, the data is automatically written to another datanode. All of this complexity is encapsulated in org.apache.hadoop.fs.FSDataOutputStream.
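
If you want to inspect or override the replication factor from code (a minimal sketch; the factor of 2 below is chosen arbitrarily), FileSystem exposes it directly:

// client-side replication setting (defaults to 3 if unset)
short configured = (short) conf.getInt("dfs.replication", 3);
System.out.println("dfs.replication = " + configured);

// create the file with an explicit replication factor instead of the default
FSDataOutputStream replicatedOut = fs.create(newFilePath, (short) 2);
replicatedOut.writeBytes(fileContent);
replicatedOut.close();

// ask the namenode which replication factor the file actually has
System.out.println("replication: " + fs.getFileStatus(newFilePath).getReplication());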

Listing files

public FileStatus[] listStatus(Path p) throws IOException

This method lists all the files under a folder.

public FileStatus[] globStatus(Path pathPattern) throws IOException

This method lists all the files that match a given pattern.

Here is the sample code:

// list files
FileStatus[] fileStatusList = fs.listStatus(newFolderPath);
// list all text files
//FileStatus[] fileStatusList = fs.globStatus(new Path(folder + "/*.txt"));
int length = fileStatusList.length;
System.out.println("found " + length + " files");
for (FileStatus status : fileStatusList) {
    System.out.println(status.getPath().toString());
}
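
For a recursive listing, FileSystem also provides listFiles(), which returns an iterator over every file below a path. A small sketch, reusing the fs and newFolderPath from above:

// requires: import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.RemoteIterator;
RemoteIterator<LocatedFileStatus> it = fs.listFiles(newFolderPath, true);
while (it.hasNext()) {
    LocatedFileStatus status = it.next();
    System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
}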

Reading file content

public FSDataInputStream open(Path p) throws IOException

The org.apache.hadoop.fs.FileSystem.open() method returns an org.apache.hadoop.fs.FSDataInputStream. This class first asks the namenode for all the blocks that make up the file; for each block, the namenode returns the addresses of several datanodes, sorted by their distance to the client. The client fetches the data from the nearest datanode, and if that datanode is unavailable it falls back to the next one. All of this complexity is encapsulated in org.apache.hadoop.fs.FSDataInputStream.

Here is the sample code:

FSDataInputStream input = fs.open(newFilePath);
String content= IOUtils.toString(input);
input.close();
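
To see the block placement described above, you can ask the FileSystem for the block locations of a file. A minimal sketch, reusing newFilePath from the earlier example:

// requires: import org.apache.hadoop.fs.BlockLocation;
// for each block: its offset, length and the datanodes holding a replica
FileStatus st = fs.getFileStatus(newFilePath);
BlockLocation[] blocks = fs.getFileBlockLocations(st, 0, st.getLen());
for (BlockLocation block : blocks) {
    System.out.println("offset=" + block.getOffset()
            + " length=" + block.getLength()
            + " hosts=" + String.join(",", block.getHosts()));
}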

Deleting files and folders

public boolean delete(Path path, boolean recursive) throws IOException

If path is a folder, the recursive parameter must be true, otherwise an exception is thrown (strictly speaking, only a non-empty folder triggers the exception). If path is a file, recursive can be either true or false. Here is the sample code:

// delete file
fs.delete(newFilePath, false);

// delete folder
fs.delete(newFolderPath, true);

Below are the complete Maven configuration file and the Java code.

The complete Maven configuration file

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.zdk</groupId>
    <artifactId>hadoop-demo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>hadoop-demo</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <hadoop.version>2.8.1</hadoop.version>
        <log4j.version>1.2.17</log4j.version>
        <commons-io.version>1.3.2</commons-io.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
        <!-- Hadoop main client artifact -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-io</artifactId>
            <version>${commons-io.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>io.saagie</groupId>
                <artifactId>saagie-maven-plugin</artifactId>
                <version>1.0.2</version>
                <configuration>
                    <platformId>1</platformId>
                    <jobName>example-java-read-and-write-from-hdfs</jobName>
                    <jobCategory>extract</jobCategory>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>org.zdk.hadoop_demo.App</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id> <!-- this is used for inheritance merges -->
                        <phase>package</phase> <!-- bind to the packaging phase -->
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

The complete Java code

package org.zdk.hadoop_demo;

import java.net.URI;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class App {
    public static void main(String[] args) throws Exception {
        String uri = "hdfs://localhost:9000";
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);

        // create new folder
        String folder = "/user/zdk/test-fs";
        Path newFolderPath = new Path(folder);
        if (!fs.exists(newFolderPath)) {
            fs.mkdirs(newFolderPath);
            System.out.println("new folder created:" + folder);
        }

        // create new file and write content
        String fileName = "file1.txt";
        String fileContent = "hello,hadoop!";
        Path newFilePath = new Path(newFolderPath + "/" + fileName);
        FSDataOutputStream output = fs.create(newFilePath);
        output.writeBytes(fileContent);
        output.close();
        System.out.println("file content write end");

        // list files
        FileStatus[] fileStatusList = fs.listStatus(newFolderPath);
        // list all text files
        //FileStatus[] fileStatusList = fs.globStatus(new Path(folder + "/*.txt"));
        int length = fileStatusList.length;
        System.out.println("found " + length + " files");
        for (FileStatus status : fileStatusList) {
            System.out.println(status.getPath().toString());
        }

        // read file content
        FSDataInputStream input = fs.open(newFilePath);
        String content = IOUtils.toString(input);
        System.out.println(content);
        input.close();

        // delete file
        fs.delete(newFilePath, false);

        // delete folder
        fs.delete(newFolderPath, true);

        // close fileSystem
        fs.close();

    }
}