今天无聊,想做个公众号点播歌曲的功能,所以去某网站爬音乐(站内共303222首),音乐的有效率较高。爬了8个小时还没爬完,我就不爬了。
两个方法的功能略有不同,所以有些代码显得多余,不过不管那么多了,能爬就行。其实我的很多代码都是通过分析原网站的内容和 URL 来写的,所以你可能要先去分析一下网站结构和 URL 跳转情况才看得懂。源代码如下(Music 是个 bean):
public class MusicURl { // 303222 // static QueryRunner runner = new QueryRunner(TransactionManagerUtil2.getSource()); // http://www.333ttt.com // http://www.333ttt.com/up/?page=5 page 1-30000 一个page一首歌 // <a target="h" href="/up/up2103526739.html">耿耿星河-Mc神五精华版.mp3</a></td> public static void main(String[] args) throws Exception{ int sum=0; for(int i=1;i<=30000;i++){ String allStr=getURLContent1("http://www.333ttt.com/up/?page="+i,"utf-8"); List<String>urls=getMatcherSubstrs(allStr,"<a[^>]*href=\"(?<href>[^\"]*)\"[^>]*>"); for (String string : urls) { String str=getURLContent("http://www.333ttt.com/"+string,"utf-8"); String url=getMatcherSubstrs(str,"http://[\\w/.]+\\.mp3",0); String name=getMatcherSubstrs(str,"<a[^<>]*?>\\s*(.*?)\\s*</a>",1); Music music=new Music(url, name); String json=JSON.toJSONString(music); BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File("d:/musicurl.txt"), true))); out.write(json+"\r\n"); out.close(); sum++; System.out.println("第"+sum+"个:"+json); // String sql = "insert into musicurl values(null,?,?)"; // runner.update(sql, name, url); } } } /** * * @param url地址 * @param 字符级 * @return 网页源码 */ public static String getURLContent(String urlStr,String charSet){ StringBuilder sb=new StringBuilder(); try { URL url=new URL(urlStr); BufferedReader reader=new BufferedReader(new InputStreamReader(url.openStream(),Charset.forName(charSet))); String temp=""; while((temp=reader.readLine())!=null){ if(temp.contains("音乐名称")) sb.append(temp+"\r\n"); } } catch (IOException e) { e.printStackTrace(); } return sb.toString(); } public static String getURLContent1(String urlStr,String charSet){ StringBuilder sb=new StringBuilder(); try { URL url=new URL(urlStr); BufferedReader reader=new BufferedReader(new InputStreamReader(url.openStream(),Charset.forName(charSet))); String temp=""; while((temp=reader.readLine())!=null){ sb.append(temp+"\r\n"); } } catch (IOException e) { e.printStackTrace(); } return sb.toString(); 
} /** * * @param 网页源码 * @param 正则表达式 * @return 超链接 */ public static String getMatcherSubstrs(String str,String regexStr,int i){ Pattern pattern=Pattern.compile(regexStr); Matcher matcher=pattern.matcher(str); String result = ""; if(matcher.find()){ result=matcher.group(i); } return result; } public static List<String> getMatcherSubstrs(String str,String regexStr){ Pattern pattern=Pattern.compile(regexStr); Matcher matcher=pattern.matcher(str); List<String> result = new ArrayList<String>(); while(matcher.find()){ if(matcher.group(1).endsWith(".html")&&!result.contains(matcher.group(1))) result.add(matcher.group(1)); } return result; } }爬去的文件有30多M,上传到百度云实在太慢了,我还是上传到csdn,有兴趣的朋友可以下,我是实在不想收积分钱,可是csdn上传必须设置积分,哎呀!