package jklz;

import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;
import java.util.*;

/**
 * @Author: JavaPub
 * @License: https://github.com/Rodert/ https://gitee.com/rodert/
 * @Contact:
 * @Date: 2022/5/25 16:41
 * @Version: 1.0
 * @Description: Reads a text file of URLs (one per line), cuts every line down to the
 * text between "http" and the slash that follows the host, and writes the distinct
 * results to an output file.
 */

public class Q {
    /** Input file read when no path is passed on the command line. */
    private static final String DEFAULT_INPUT = "C:\\Users\\wangshiyu\\Desktop\\url.txt";

    /** Output file written when no path is passed on the command line. */
    private static final String DEFAULT_OUTPUT = "C:\\Users\\wangshiyu\\Desktop\\list.txt";

    /** Charset name used for both reading the input and writing the output. */
    private static final String CHARSET = "utf8";

    /**
     * Offset, counted from the start of "http", at which the search for the closing slash
     * begins. "http://" is 7 characters and "https://" is 8, so starting at 9 skips the
     * two slashes of either scheme separator.
     */
    private static final int SLASH_SEARCH_OFFSET = 9;

    /**
     * Prints the number of lines from which a prefix could be extracted, then the number
     * of distinct prefixes, and finally writes the distinct prefixes to the output file.
     *
     * @param args optional; {@code args[0]} overrides the input path and {@code args[1]}
     *             overrides the output path. With no arguments the built-in defaults are used.
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        File input = new File(argOrDefault(args, 0, DEFAULT_INPUT));
        File output = new File(argOrDefault(args, 1, DEFAULT_OUTPUT));

        Set<String> prefixes = new HashSet<>();
        long count = 0;
        for (String url : FileUtils.readLines(input, CHARSET)) {
            String prefix = extractPrefix(url);
            if (prefix == null) {
                // Lines without a usable URL are skipped on purpose and not counted.
                continue;
            }
            count++;
            prefixes.add(prefix);
        }

        System.out.println(count);
        System.out.println(prefixes.size());
        // Write with the same charset the input was read with instead of the platform default.
        FileUtils.writeLines(output, CHARSET, prefixes);
    }

    /**
     * Returns the part of {@code url} that starts at the first "http" and ends just before
     * the first slash found at or after {@link #SLASH_SEARCH_OFFSET} characters later.
     *
     * @param url one input line
     * @return the extracted prefix, or {@code null} when the line contains no "http" or no
     *         such slash
     */
    private static String extractPrefix(String url) {
        int start = url.indexOf("http");
        if (start < 0) {
            return null;
        }
        int end = url.indexOf('/', start + SLASH_SEARCH_OFFSET);
        if (end < 0) {
            // NOTE(review): a URL with nothing after the host (no trailing slash) is dropped
            // here, as before; confirm whether such lines should be kept.
            return null;
        }
        return url.substring(start, end);
    }

    /** Returns {@code args[index]} when it is present, otherwise {@code fallback}. */
    private static String argOrDefault(String[] args, int index, String fallback) {
        return args != null && args.length > index ? args[index] : fallback;
    }
}

package jklz;

import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @Author: JavaPub
 * @License: https://github.com/Rodert/ https://gitee.com/rodert/
 * @Contact:
 * @Date: 2022/5/25 15:51
 * @Version: 1.0
 * @Description: Reads a text file of URLs (one per line) and reduces every URL to
 * "scheme://label.suffix", where the suffix comes from a fixed list of domain suffixes.
 * Lines that contain "http" but yield no match are printed with a "##### " prefix.
 */

public class DisHttpUrl {
    /** Input file read when no path is passed on the command line. */
    private static final String DEFAULT_INPUT = "C:\\Users\\wangshiyu\\Desktop\\url.txt";

    /** Charset name used for reading the input and, when requested, writing the output. */
    private static final String CHARSET = "utf8";

    /**
     * Recognised domain suffixes as one regex alternation. Alternation is ordered, so the
     * multi-label suffixes come first; otherwise "eu" would win over "eu.int". Dots are
     * escaped so they match only a literal dot.
     */
    private static final String SUFFIXES =
            "or\\.kr|gob\\.cu|eu\\.int|com\\.cn|net\\.cn|gov\\.cn|org\\.nz|org\\.cn"
            + "|ps|ba|gi|qa|sk|ar|is|rs|am|sy|ve|energy|pa|hu|vg|ky|gg|do|gl|in|ee|pl|gr|ie"
            + "|no|de|uy|kz|pt|bg|zm|md|ro|vn|ly|cu|th|fi|dk|lv|by|at|edu|ae|nl|sd|ua|se|mt"
            + "|ch|lu|id|kr|it|es|mx|fr|mc|be|si|us|hk|ir|io|ru|jp|eu|uk|ca|int|iq"
            + "|com|net|org|gov|cc|biz|info|cn|co";

    /**
     * Group 1 is the scheme, group 2 is "label.suffix".
     * <ul>
     * <li>Leading labels are skipped lazily, so the leftmost label followed by a known
     * suffix wins ("www.google.com" gives "google.com").</li>
     * <li>Labels may not contain '.', '/', '?', '#', ':' or whitespace, which keeps the
     * match inside the host part.</li>
     * <li>The trailing look-ahead rejects a suffix that is only the beginning of a longer
     * label ("in" in "industry").</li>
     * </ul>
     * Example: "http://europa.eu.int/eur-lex/..." gives "http" and "europa.eu.int".
     */
    private static final Pattern SITE = Pattern.compile(
            "(https?)://(?:[^./?#:\\s]+\\.)*?([^./?#:\\s]+\\.(?:" + SUFFIXES + "))(?![\\w-])",
            Pattern.CASE_INSENSITIVE);

    /**
     * Collects the distinct sites found in the input file and reports unmatched lines on
     * standard output.
     *
     * @param args optional; {@code args[0]} overrides the input path. When {@code args[1]}
     *             is given, the distinct sites are written to that file; without it the
     *             collected set is not written anywhere.
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        File input = new File(args != null && args.length > 0 ? args[0] : DEFAULT_INPUT);

        Set<String> sites = new HashSet<>();
        for (String line : FileUtils.readLines(input, CHARSET)) {
            if (!line.contains("http")) {
                // Lines without any "http" are ignored silently.
                continue;
            }
            String site = extractSite(line);
            if (site == null) {
                System.out.println("##### " + line);
            } else {
                sites.add(site);
            }
        }

        if (args != null && args.length > 1) {
            FileUtils.writeLines(new File(args[1]), CHARSET, sites);
        }
    }

    /**
     * Extracts "scheme://label.suffix" from the first URL in {@code line}.
     *
     * @param line one input line
     * @return the site with a lower-case "http://" or "https://" prefix taken from the URL
     *         itself, or {@code null} when the line holds no URL with a known suffix
     */
    private static String extractSite(String line) {
        Matcher matcher = SITE.matcher(line);
        if (!matcher.find()) {
            return null;
        }
        // Group 1 is "http" or "https" in any letter case; emit the canonical lower-case form.
        String scheme = matcher.group(1).length() == "https".length() ? "https://" : "http://";
        return scheme + matcher.group(2);
    }
}