hdfs api 问题 hdfs常见问题及分析

转载

mob64ca140eb362 2024-06-11 08:17:40

文章标签 hdfs api 问题 hdfs hadoop 大数据 apache 文章分类 架构后端开发

文章目录

文件读写
文件上传
文件下载
使用字符流读取数据
删除文件
删除文件夹

自定义数据输入流

文件读写

获取hadoop的系统设置，并在其中创建HDFS文件，文件路径为/user/hadoop/myfile；
在myfile文件中添加字符串；
读取刚刚创建myfile文件中的内容，并输出。

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Creates /user/hadoop/myfile on HDFS, writes a string into it with writeUTF and
 * immediately reads it back with readUTF.
 *
 * NOTE(review): the output stream is closed only after the read. The surrounding
 * article reports that this ordering raises java.io.EOFException and that moving
 * outStream.close() to directly after the write makes the program succeed.
 * Neither inStream nor fs is closed anywhere in this method.
 */
public class hdfs {

	public static void main(String[] args) throws IOException, InterruptedException, URISyntaxException {
		// Address of the HDFS instance to connect to.
		URI uri = new URI("hdfs://hadoop0:8088/");
		// Configuration carries settings between the Hadoop modules.
	    Configuration conf = new Configuration();
	    // Obtain the file system, connecting as user "root".
	    FileSystem fs=FileSystem.get(uri,conf,"root");
	    // Path of the file to create.
	    Path file=new Path("hdfs://hadoop0:8088/user/hadoop/myfile");
	    // Open an output stream on the (new) file.
	    FSDataOutputStream outStream = fs.create(file);
	    // Write the payload (an empty string here).
	    outStream.writeUTF("");
	    
	    // Open an input stream on the same file while outStream is still open.
	    FSDataInputStream inStream=fs.open(file);
	    // Read the string back and print it.
	    String data=inStream.readUTF();
	    System.out.print(data);
	    // Closing here, after the read, is the step the article identifies as the cause of the EOFException.
	    outStream.close();
	}

}

出现java.io.EOFException的错误:

hdfs api 问题 hdfs常见问题及分析_大数据

是因为我把关闭输出流的操作放在了最后一步：输出流在关闭（或调用hflush()）之前，写入的数据还停留在客户端缓冲区中，对新打开的输入流不可见，所以读取时文件实际上还是空的。EOF表示读到了文件尾（执行 String data = inStream.readUTF(); 时后面已经没有内容可以读了），因此这个异常并不是集群本身出了故障，而只是说明读取发生在数据真正写入HDFS之前。hdfs的写入流程如下:

hdfs api 问题 hdfs常见问题及分析_hdfs api 问题_02

1)客户端首先调用DistributedFileSystem对象的create方法，去和namenode建立rpc通信，请求写入数据。
2)namenode收到请求，会进行诸如文件是否存在、用户是否拥有相应权限等一系列的检查。若检查通过, 则为该次上传建立一次记录，并返回给客户端一个FSDataOutputStream对象；检查不通过，返回给客户端异常信息。
3)客户端拿到FSDataOutputStream对象后先对文件进行线性切块，然后会调用FSDataOutputStream对象的write()方法，开始上传第一个block块，上传前会在block的三个副本对应的机器之间建立一条pipeline通
道。每次成功写入一个packet到第一台机器上，三台机器立马通过pipeline进行数据的同步，每次第一台机子通过pipeline成功向后传一个packet，就会放入一个ack packet到应答队列等待应答。
4)每一个block的最后一个packet上传完毕后，datanode给应答队列返回应答信息，然后client会请求
namenode上传下一个block数据，直到所有block上传完毕。
5)上传完毕后，客户端调用FSDataOutputStream对象的close()和flush()关闭刷新输出流，然后通知
namenode上传完毕。

将关闭流的语句移动到完成写入操作之后即可

hdfs api 问题 hdfs常见问题及分析_大数据_03

再次运行,即可成功输出

hdfs api 问题 hdfs常见问题及分析_大数据_04

文件上传

向HDFS中上传文本文件，如果指定的文件在HDFS中已经存在，由用户指定是追加到原有文件末尾还是覆盖原有的文件。在本地目录下的test.txt文件中的内容是: ##hdfs

hdfs api 问题 hdfs常见问题及分析_hadoop_05

代码如下:

import java.io.FileInputStream;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Uploads a local text file to HDFS. When the remote file already exists the
 * local content is either appended to it or replaces it.
 */
public class hdfs_upload {
	/** Address every helper in this class connects to. */
	private static final String HDFS_URI = "hdfs://hadoop0:8088";
	/** User name passed to FileSystem.get for every connection. */
	private static final String HDFS_USER = "root";
	/** Remote file seeded by createHDFSFile and targeted by main. */
	private static final String REMOTE_FILE = HDFS_URI + "/user/hadoop/text.txt";

	/**
	 * Opens a new FileSystem handle. java.net.URI is spelled out in full because
	 * this file's import list does not include it.
	 */
	private static FileSystem connect(Configuration conf) throws IOException, InterruptedException {
		return FileSystem.get(java.net.URI.create(HDFS_URI), conf, HDFS_USER);
	}

	/**
	 * Same as connect, for callers whose signature only allows IOException:
	 * an interruption is re-flagged on the thread and reported as an IOException
	 * instead of being swallowed (which previously led to a NullPointerException).
	 */
	private static FileSystem connectIoOnly(Configuration conf) throws IOException {
		try {
			return connect(conf);
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			throw new IOException("Interrupted while connecting to " + HDFS_URI, e);
		}
	}

	/**
	 * Checks whether a path exists on HDFS.
	 *
	 * @param conf Hadoop configuration used to open the connection
	 * @param path HDFS path to look up
	 * @return true if the path exists
	 * @throws IOException if the connection or the lookup fails
	 */
	public static boolean test(Configuration conf, String path) throws IOException{
		try (FileSystem fs = connectIoOnly(conf)) {
			return fs.exists(new Path(path));
		}
	}

	/**
	 * Copies a local file to HDFS, overwriting the remote file if it exists.
	 * The local source file is kept.
	 *
	 * @param conf           Hadoop configuration used to open the connection
	 * @param localFilePath  path of the local source file
	 * @param remoteFilePath HDFS destination path
	 */
	public static void copyFromLocalFile(Configuration conf, String localFilePath, String remoteFilePath) throws IOException, InterruptedException{
		try (FileSystem fs = connect(conf)) {
			// First flag: delete the source afterwards; second flag: overwrite the destination.
			fs.copyFromLocalFile(false, true, new Path(localFilePath), new Path(remoteFilePath));
		}
	}

	/**
	 * Appends the content of a local file to the end of an existing HDFS file.
	 *
	 * @param conf           Hadoop configuration used to open the connection
	 * @param localFilePath  path of the local source file
	 * @param remoteFilePath HDFS file to append to
	 */
	public static void appendToFile(Configuration conf, String localFilePath, String remoteFilePath) throws IOException, InterruptedException{
		// Resources are closed in reverse order, so the HDFS stream is closed before the FileSystem.
		try (FileSystem fs = connect(conf);
				FileInputStream in = new FileInputStream(localFilePath);
				FSDataOutputStream out = fs.append(new Path(remoteFilePath))) {
			byte[] buffer = new byte[1024];
			int read;
			while ((read = in.read(buffer)) != -1) {
				out.write(buffer, 0, read);
			}
		}
	}

	/**
	 * Entry point.
	 *
	 * @param args optional: args[0] selects what happens when the remote file
	 *             already exists, "append" (default) or "overwrite"
	 */
	public static void main(String[] args)throws IOException  {
		Configuration conf = new Configuration();
		// Seeds the remote file, so by the time of the existence check below it is normally present.
		createHDFSFile(conf);
		String localFilePath = "E:\\Users\\cl\\eclipse-workspace\\hadoop\\hdfs\\src\\practice\\file\\test.txt";			// local path
		String remoteFilePath = REMOTE_FILE;    // HDFS path
		String choice = args.length > 0 ? args[0] : "append";
		try {
			boolean fileExists = test(conf, remoteFilePath);
			if (fileExists) {
				System.out.println(remoteFilePath + " 已存在.");
			} else {
				System.out.println(remoteFilePath + " 不存在.");
			}

			if (!fileExists) {
				// Nothing there yet: upload the local file itself.
				copyFromLocalFile(conf, localFilePath, remoteFilePath);
				System.out.println(localFilePath + " 已上传至 " + remoteFilePath);
			} else if ("overwrite".equals(choice)) {
				copyFromLocalFile(conf, localFilePath, remoteFilePath);
				System.out.println(localFilePath + " 已覆盖 " + remoteFilePath);
			} else if ("append".equals(choice)) {
				appendToFile(conf, localFilePath, remoteFilePath);
				System.out.println(localFilePath + " 已追加至 " + remoteFilePath);
			} else {
				System.out.println("未知的选项: " + choice + " (应为 append 或 overwrite)");
			}
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			e.printStackTrace();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Creates (or replaces) the remote demo file with the text "hello".
	 * The text is written as plain UTF-8 bytes; writeUTF would prepend a 2-byte
	 * length header that does not belong in a text file.
	 */
	private static void createHDFSFile(Configuration conf) throws IOException {
		try (FileSystem fs = connectIoOnly(conf);
				FSDataOutputStream outStream = fs.create(new Path(REMOTE_FILE))) {
			outStream.write("hello".getBytes("UTF-8"));
		}
	}
}

如出现如下错误,是由于我们没有以指定的身份访问特定的uri来获取文件系统,把所有FileSystem fs=FileSystem.get(conf);改成FileSystem fs=FileSystem.get(URI.create("hdfs://hadoop0:8088"),conf,"root");即可,以root身份访问文件系统防止出现禁止访问的错误

hdfs api 问题 hdfs常见问题及分析_apache_06

修改后重新运行,出现新的错误,这是因为datanode写入失败,而我的集群只有一台,找不到其他可用的datanode节点来写入,所以会报Failed to replace a bad datanode的错误:

hdfs api 问题 hdfs常见问题及分析_hdfs_07

在hdfs-site.xml文件中添加如下几行代码,关闭hdfs,然后重新格式化hdfs,格式化之前先删除hadoop目录下的data和logs文件夹

<!-- Client-side handling of a datanode that fails while a block is being written;
     the article adds these two properties in response to the
     "Failed to replace a bad datanode" error on a single-datanode cluster. -->
<property>
	<name>dfs.client.block.write.replace-datanode-on-failure.enable</name>
	<value>true</value>
</property>
<!-- Replacement policy; NEVER presumably means no replacement datanode is requested
     (confirm against hdfs-default.xml for the Hadoop version in use). -->
<property>
           <name>dfs.client.block.write.replace-datanode-on-failure.policy</name>
           <value>NEVER</value>
</property>

还是出现同样的错误,在appendToFile()方法中添加如下几行代码:

// Keep a single replica per block (the key is "dfs.replication"; a misspelled key is silently ignored).
conf.setInt("dfs.replication", 1);
// Do not ask for a replacement datanode when one in the write pipeline fails.
conf.setBoolean("dfs.client.block.write.replace-datanode-on-failure.enable", false);
// Enable append support.
conf.setBoolean("dfs.support.append",true);

hdfs api 问题 hdfs常见问题及分析_apache_08

然后重新运行,运行成功并且成功将test.txt文件追加到hdfs目录中的/user/hadoop/text.txt文件中

hdfs api 问题 hdfs常见问题及分析_hadoop_09

hdfs api 问题 hdfs常见问题及分析_hdfs api 问题_10

在这里遇见的还有一个问题就是本地路径写成相对路径会找不到文件,所以我写了绝对路径,后续如果我找到了原因再补充.

文件下载

将文件拷贝至本地只需要调用FileSystem中的一个方法即可，如下：

// Fragment: conf, localFilePath and remotePath are defined by the surrounding code.
// Obtain the FileSystem described by conf.
FileSystem fs = FileSystem.get(conf);
// Destination on the local disk, wrapped as a Hadoop Path.
Path localPath = new Path(localFilePath);
// Copy the remote file to the local destination.
fs.copyToLocalFile(remotePath, localPath);

完成HDFS中下载文件的功能,代码如下:

import java.io.File;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Downloads a file from HDFS to the local disk, choosing a fresh local name
 * when the requested one is already taken.
 */
public class hdfs_copyToLocalFile {
	/** Address every helper in this class connects to. */
	private static final String HDFS_URI = "hdfs://hadoop0:8088";
	/** User name passed to FileSystem.get for every connection. */
	private static final String HDFS_USER = "root";

	/**
	 * Opens a new FileSystem handle. java.net.URI is spelled out in full because
	 * this file's import list does not include it.
	 */
	private static FileSystem connect(Configuration conf) throws IOException, InterruptedException {
		return FileSystem.get(java.net.URI.create(HDFS_URI), conf, HDFS_USER);
	}

	/**
	 * Returns the first of path_0, path_1, ... that does not exist locally.
	 */
	private static String firstFreeName(String path) throws IOException {
		for (int i = 0; i >= 0; i++) {
			String candidate = path + "_" + i;
			if (!new File(candidate).exists()) {
				return candidate;
			}
		}
		throw new IOException("No free local name available for " + path);
	}

	/**
	 * Downloads an HDFS file to the local disk. If the local path already
	 * exists, the download goes to the first free name with a numeric suffix
	 * (_0, _1, ...) appended to the requested path.
	 *
	 * @param conf           Hadoop configuration used to open the connection
	 * @param remoteFilePath HDFS source path
	 * @param localFilePath  requested local destination
	 * @throws IOException if the connection or the copy fails, or the thread is
	 *                     interrupted while connecting
	 */
	public static void copyToLocal(Configuration conf, String remoteFilePath, String localFilePath) throws IOException {
		String target = localFilePath;
		if (new File(target).exists()) {
			System.out.println(localFilePath + " 已存在.");
			target = firstFreeName(localFilePath);
			System.out.println("将重命名为: " + target);
		}

		try (FileSystem fs = connect(conf)) {
			fs.copyToLocalFile(new Path(remoteFilePath), new Path(target));
		} catch (InterruptedException e) {
			// Report instead of swallowing; a swallowed failure used to end in a NullPointerException.
			Thread.currentThread().interrupt();
			throw new IOException("Interrupted while connecting to " + HDFS_URI, e);
		}
	}

	/**
	 * Entry point: creates the remote demo file, then downloads it.
	 */
	public static void main(String[] args)throws IOException {
		Configuration conf = new Configuration();
		String localFilePath = "E:\\Users\\cl\\eclipse-workspace\\hadoop\\hdfs\\src\\practice\\tmp\\output\\text.txt";    // local path
		String remoteFilePath = "hdfs://hadoop0:8088/user/hadoop/text2.txt";    // HDFS path

		try {
			createHDFSFile(conf);
			copyToLocal(conf, remoteFilePath, localFilePath);
			System.out.println("下载完成");
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			e.printStackTrace();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Creates (or replaces) /user/hadoop/text2.txt with a short line of text.
	 * The text is written as plain UTF-8 bytes; writeUTF would prepend a 2-byte
	 * length header that does not belong in a text file.
	 */
	public static void createHDFSFile(Configuration conf)throws IOException, InterruptedException{
		try (FileSystem fs = connect(conf);
				FSDataOutputStream outStream = fs.create(new Path("hdfs://hadoop0:8088/user/hadoop/text2.txt"))) {
			outStream.write("hello hadoop HDFS ".getBytes("UTF-8"));
		}
	}
}

运行代码,下载完成:

hdfs api 问题 hdfs常见问题及分析_apache_11

使用字符流读取数据

使用字符流读取数据简单来说分为三个步骤：

通过Configuration对象获取FileSystem对象；
通过fs获取FSDataInputStream对象；
通过字符流循环读取文件中数据并输出。

将HDFS中指定文件输出到指定文件中,代码如下:

import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Reads a text file from HDFS through a character stream and writes its
 * content to a local file.
 */
public class hdfs_catFile {

	/**
	 * Copies the text of an HDFS file, line by line, into the local file
	 * tmp\output\text3.txt. Line breaks between lines are kept (written with the
	 * platform line separator); no separator is added after the last line.
	 *
	 * @param conf           Hadoop configuration used to open the connection
	 * @param remoteFilePath HDFS file to read
	 * @throws IOException          if reading or writing fails
	 * @throws InterruptedException if the thread is interrupted while connecting
	 */
	public static void cat(Configuration conf, String remoteFilePath) throws IOException, InterruptedException {
		String localFilePath = "E:\\Users\\cl\\eclipse-workspace\\hadoop\\hdfs\\src\\practice\\tmp\\output\\text3.txt";
		// All four resources are closed, in reverse order, even when an exception is thrown.
		try (FileSystem fs = FileSystem.get(URI.create("hdfs://hadoop0:8088"),conf,"root");
				FSDataInputStream in = fs.open(new Path(remoteFilePath));
				BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
				FileWriter fileWriter = new FileWriter(localFilePath)) {
			String line;
			boolean first = true;
			while ((line = reader.readLine()) != null) {
				if (!first) {
					// readLine strips the terminator, so put one back between lines.
					fileWriter.write(System.lineSeparator());
				}
				fileWriter.write(line);
				first = false;
			}
		}
	}

	/**
	 * Entry point: creates the remote demo file, then copies it to the local disk.
	 *
	 * @throws InterruptedException if the thread is interrupted while creating the demo file
	 */
	public static void main(String[] args)throws IOException, InterruptedException {
		Configuration conf = new Configuration();
		createHDFSFile(conf);
		String remoteFilePath = "hdfs://hadoop0:8088/user/hadoop/text3.txt";

		try {
			System.out.println("读取文件 " + remoteFilePath);
			cat(conf, remoteFilePath);
			System.out.println("\n读取完成");
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Creates (or replaces) /user/hadoop/text3.txt with one line of text.
	 * The text is written as plain UTF-8 bytes; writeUTF would prepend a 2-byte
	 * length header, which cat would then read back as two stray characters.
	 */
	public static void createHDFSFile(Configuration conf)throws IOException, InterruptedException{
		try (FileSystem fs = FileSystem.get(URI.create("hdfs://hadoop0:8088"),conf,"root");
				FSDataOutputStream outStream = fs.create(new Path("hdfs://hadoop0:8088/user/hadoop/text3.txt"))) {
			outStream.write("hello hadoop HDFS step4 www.educoder.net".getBytes("UTF-8"));
		}
	}
}

运行结果:

hdfs api 问题 hdfs常见问题及分析_hdfs_12

删除文件

删除HDFS中指定文件需要使用HDFS Java API中FileSystem的delete()方法。

public boolean delete(Path f, boolean recursive) 永久性删除指定的文件或目录，如果f是一个空目录或者文件，那么recursive的值就会被忽略。只有recursive＝true时，一个非空目录及其内容才会被删除（即递归删除所有文件）。

删除hdfs中的文件,代码如下:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Deletes a single file from HDFS.
 */
public class hdfs_rmFile {

	/**
	 * Deletes the given HDFS path without recursion.
	 *
	 * @param conf           Hadoop configuration used to open the connection
	 * @param remoteFilePath HDFS path to delete
	 * @return the result reported by FileSystem.delete
	 * @throws IOException          if the connection or the delete fails
	 * @throws InterruptedException if the thread is interrupted while connecting
	 */
	public static boolean rm(Configuration conf, String remoteFilePath) throws IOException, InterruptedException {
		// java.net.URI is spelled out in full because this file's import list does not include it.
		try (FileSystem fs = FileSystem.get(java.net.URI.create("hdfs://hadoop0:8088"),conf,"root")) {
			return fs.delete(new Path(remoteFilePath), false);
		}
	}

	/**
	 * Entry point: deletes /user/hadoop/text.txt and reports the outcome.
	 */
	public static void main(String[] args) {
		Configuration conf = new Configuration();
		String remoteFilePath = "hdfs://hadoop0:8088/user/hadoop/text.txt";

		try {
			if (rm(conf, remoteFilePath) ) {
				System.out.println("文件删除: " + remoteFilePath);
			} else {
				System.out.println("操作失败(文件不存在或删除失败)");
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}

运行结果:

hdfs api 问题 hdfs常见问题及分析_hdfs_13

hdfs api 问题 hdfs常见问题及分析_hdfs api 问题_14

删除文件夹

1.验证目录下是否存在文件

使用到的方法public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive)
该方法的作用是：列出给定路径中文件的状态和块位置。如果f是一个目录，recursive是false，则返回目录中的文件；如果recursive是true，则返回以该路径为根的整个子树中的文件。如果路径是文件，则返回文件的状态和块位置。

// Fragment: conf and remoteDir are defined by the surrounding code.
FileSystem fs = FileSystem.get(conf);  
Path dirPath = new Path(remoteDir);  
// Iterate over the files under dirPath (second argument: recursive).
RemoteIterator<LocatedFileStatus> remoteIterator = fs.listFiles(dirPath, true);  
//remoteIterator.hasNext() returns a boolean: true means the folder is not empty, false means it is empty.

2. 删除HDFS中的文件或目录

删除HDFS中指定文件需要使用HDFS Java API中FileSystem的delete()方法。

// Fragment: conf and remoteFilePath are defined by the surrounding code.
FileSystem fs = FileSystem.get(conf);  
Path remotePath = new Path(remoteFilePath);  
// Second argument false: do not delete recursively.
boolean result =  fs.delete(remotePath, false);

删除HDFS中/user/hadoop/tmp目录和/user/hadoop/dir目录，删除前，需要判断两个目录是否为空，若不为空则不删除，否则删除。其中/user/hadoop/tmp目录不为空,/user/hadoop/dir目录为空,

hdfs api 问题 hdfs常见问题及分析_hadoop_15

代码如下:

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

/**
 * Deletes HDFS directories, but only when they are empty (unless forced).
 */
public class hdfs_rmDir {

	/**
	 * Checks whether a directory is empty.
	 *
	 * The check uses listStatus, which reports sub-directories as well as files.
	 * A recursive listFiles only reports files, so a directory holding nothing
	 * but empty sub-directories would be treated as empty and the following
	 * non-recursive delete would fail.
	 *
	 * @param conf      Hadoop configuration used to open the connection
	 * @param remoteDir HDFS directory to inspect
	 * @return true if the directory has no entries, false otherwise
	 * @throws IOException          if the connection or the listing fails
	 * @throws InterruptedException if the thread is interrupted while connecting
	 */
	public static boolean isDirEmpty(Configuration conf, String remoteDir) throws IOException, InterruptedException {
		try (FileSystem fs = FileSystem.get(URI.create("hdfs://hadoop0:8088"),conf,"root")) {
			return fs.listStatus(new Path(remoteDir)).length == 0;
		}
	}

	/**
	 * Deletes a directory.
	 *
	 * @param conf      Hadoop configuration used to open the connection
	 * @param remoteDir HDFS directory to delete
	 * @param recursive whether the directory's content is deleted with it
	 * @return the result reported by FileSystem.delete
	 * @throws IOException          if the connection or the delete fails
	 * @throws InterruptedException if the thread is interrupted while connecting
	 */
	public static boolean rmDir(Configuration conf, String remoteDir, boolean recursive) throws IOException, InterruptedException {
		try (FileSystem fs = FileSystem.get(URI.create("hdfs://hadoop0:8088"),conf,"root")) {
			return fs.delete(new Path(remoteDir), recursive);
		}
	}

	/**
	 * Deletes one directory unless it is non-empty and deletion is not forced,
	 * and prints the outcome.
	 */
	private static void removeIfAllowed(Configuration conf, String dir, boolean forceDelete) throws IOException, InterruptedException {
		if (!forceDelete && !isDirEmpty(conf, dir)) {
			System.out.println(dir+"目录不为空,不删除");
			return;
		}
		if (rmDir(conf, dir, forceDelete)) {
			System.out.println("目录已删除: " + dir);
		} else {
			System.out.println("操作失败");
		}
	}

	/**
	 * Entry point: tries to delete /user/hadoop/dir and /user/hadoop/tmp.
	 */
	public static void main(String[] args) {
		Configuration conf = new Configuration();
		String remoteDir = "hdfs://hadoop0:8088/user/hadoop/dir";
		String remoteDir1 = "hdfs://hadoop0:8088/user/hadoop/tmp";
		// Whether non-empty directories are deleted as well.
		boolean forceDelete = false;

		try {
			for (String dir : new String[] {remoteDir, remoteDir1}) {
				removeIfAllowed(conf, dir, forceDelete);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}

运行结果

hdfs api 问题 hdfs常见问题及分析_apache_16

自定义数据输入流

public int read(char[] cbuf,int off,int len)throws IOException

此方法实现 Reader 类相应 read 方法的常规协定。另一个便捷之处在于，它将通过重复地调用底层流的 read 方法，尝试读取尽可能多的字符。这种迭代的 read 会一直继续下去，直到满足以下条件之一：已经读取了指定的字符数，底层流的 read 方法返回 -1，指示文件末尾（end-of-file），或者底层流的 ready 方法返回 false，指示将阻塞后续的输入请求。如果第一次对底层流调用 read 返回 -1（指示文件末尾），则此方法返回 -1，否则此方法返回实际读取的字符数。

实现按行读取HDFS中指定文件的方法readLine()，如果读到文件末尾，则返回空，否则返回文件一行的文本，即实现和BufferedReader类的readLine()方法类似的效果。代码如下:

import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * FSDataInputStream subclass that adds a hand-written line reader and a small
 * demo copying an HDFS text file to the local disk.
 */
public class MyFSDataInputStream extends FSDataInputStream {
	public MyFSDataInputStream(InputStream in) {
		super(in);
	}

	/**
	 * Reads one line, one character at a time.
	 *
	 * Carriage returns are dropped and a line feed ends the line. The next
	 * character is fetched in the loop condition, so skipping a carriage return
	 * with continue still advances the reader (previously it re-examined the
	 * same character forever).
	 *
	 * @param br reader to take characters from
	 * @return the line without its terminator, or null when the end of the
	 *         input is reached and no characters were collected
	 * @throws IOException if reading fails
	 */
	public static String readline(BufferedReader br) throws IOException {
		StringBuilder sb = new StringBuilder();
		int ch;
		while ((ch = br.read()) != -1) {
			if (ch == '\r') {
				continue;
			}
			if (ch == '\n') {
				return sb.toString();
			}
			sb.append((char) ch);
		}
		// End of input: hand back a final line that had no terminator, if any.
		return sb.length() != 0 ? sb.toString() : null;
	}

	/**
	 * Copies the text of an HDFS file, line by line, into the local file
	 * tmp\output\text4.txt. Line breaks between lines are kept (written with the
	 * platform line separator); no separator is added after the last line.
	 *
	 * @param conf           Hadoop configuration used to open the connection
	 * @param remoteFilePath HDFS file to read
	 * @throws IOException          if reading or writing fails
	 * @throws InterruptedException if the thread is interrupted while connecting
	 */
	public static void cat(Configuration conf, String remoteFilePath) throws IOException, InterruptedException {
		// All four resources are closed, in reverse order, even when an exception is thrown.
		try (FileSystem fs = FileSystem.get(URI.create("hdfs://hadoop0:8088"),conf,"root");
				FSDataInputStream in = fs.open(new Path(remoteFilePath));
				BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
				FileWriter f = new FileWriter("E:\\Users\\cl\\eclipse-workspace\\hadoop\\hdfs\\src\\practice\\tmp\\output\\text4.txt")) {
			String line;
			boolean first = true;
			while ((line = MyFSDataInputStream.readline(br)) != null) {
				if (!first) {
					// readline strips the terminator, so put one back between lines.
					f.write(System.lineSeparator());
				}
				f.write(line);
				first = false;
			}
		}
	}

	/**
	 * Entry point: copies /user/hadoop/text2.txt to the local disk.
	 */
	public static void main(String[] args) {
		Configuration conf = new Configuration();
		String remoteFilePath = "hdfs://hadoop0:8088/user/hadoop/text2.txt";
		try {
			MyFSDataInputStream.cat(conf, remoteFilePath);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}

运行结果:

hdfs api 问题 hdfs常见问题及分析_apache_17