Java爬虫使用到的
原始获取网页内容类
/**
* 获取页面内容
*
* @return 返回获取的网页内容
* @throws IOException
*/
public static String getPageInfo(String urlStr, String Referer) throws IOException {
// 超时时间
int timeOut = 30 * 1000;
// 网页读取最大长度
long maxReaderLength = 1000 * 5000;
String htmlContent = "";
URL url = new URL(urlStr);
HttpURLConnection uc = (HttpURLConnection) url.openConnection();
uc.setConnectTimeout(timeOut);
uc.setDoOutput(true);
uc.setUseCaches(false);
uc.setRequestMethod("GET");
uc.setRequestProperty("Host", url.getHost());
uc.setRequestProperty("Keep-Alive", "300");
uc.setRequestProperty("Accept", "*/*");
uc.setRequestProperty("Referer", Referer);
uc.setRequestProperty("Accept-Encoding", "gzip,deflate,sdch");
uc.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
uc.setRequestProperty("Cache-Control", "max-age=0");
uc.setRequestProperty("Connection", "keep-alive");
uc.setRequestProperty("User-Agent",
"Mozilla/5.0 (Windows NT 6.3; WOW64; compatible; MSIE 5.0; DigExt) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2194.2 Safari/537.36");
uc.setReadTimeout(timeOut);
if (uc.getResponseCode() == 302) {
throw new IOException("请求被跳转,断线重连");
}
// 取得字符集编码
String charSet = null;
String contentType = uc.getHeaderField("Content-Type");
if (contentType != null && !contentType.equals("")) {
String[] contentTypeArr = contentType.split(";");
for (int i = 0; i < contentTypeArr.length; i++) {
if (contentTypeArr[i].indexOf("charset=") != -1) {
String[] charSetArr = contentTypeArr[i].split("=");
charSet = charSetArr[1].trim();
break;
}
}
}
// 当字符集为GB2312时转成父集GBK
if (charSet != null) {
charSet = charSet.equalsIgnoreCase("GB2312") ? "GBK" : charSet;
} else {
charSet = "utf-8";
}
InputStream uis = null;
try {
uis = uc.getInputStream();
} catch (IOException e) {
htmlContent = "您访问的网页不存在";
}
InputStreamReader reader;
StringBuffer htmlStr = new StringBuffer();
// 检测返回的页面源码是否被压缩
try {
if (uc.getHeaderField("Content-Encoding") != null
&& uc.getHeaderField("Content-Encoding").equalsIgnoreCase("gzip"))
reader = new InputStreamReader(new GZIPInputStream(uis), charSet);
else
reader = new InputStreamReader(uis, charSet);
int readNum = 0;
while (true) {
int b = reader.read();
if (b == -1 || readNum > maxReaderLength)
break;
htmlStr.append((char) b);
readNum++;
}
reader.close();
} catch (Exception eex) {
eex.printStackTrace();
}
// 处理获取到的网页内容,将内容转换为一行字符串
htmlContent = htmlStr.toString().replace("\r\n", "").replace("\n", "").replace("\r", "")
.replaceAll("\\\\\\\\\\\\/", "/").replaceAll("\\\\\\\\\\\\\\\"", "\"").replaceAll("\\\\\"", "\"")
.replaceAll("\\\\/", "\\/");
return htmlContent;
}
测试函数
public static void main(String[] args) {
try {
System.out.println(getPageInfo("http://www.runoob.com/python/att-dictionary-setdefault.html",
"http://www.runoob.com/python/att-dictionary-setdefault.html"));
} catch (IOException e) {
e.printStackTrace();
}
}
代理访问
获取网页内容部分修改
上面是直接根据本地网络来进行请求地址,接下来让我们进入下一个环节,如果使用代理进行访问,本博客目前使用的都得免费的代理所以没有涉及到账号密码部分,感兴趣的同学可以自己私下进行查找相关文档。
/**
* 获取页面内容
*
* @return 返回获取的网页内容
* @throws IOException
*/
public static String getPageInfo(String urlStr, String Referer, String ip, int port) throws IOException {
// 超时时间
int timeOut = 30 * 1000;
// 网页读取最大长度
long maxReaderLength = 1000 * 5000;
String htmlContent = "";
URL url = new URL(urlStr);
// 指定代理服务器地址和端口
InetSocketAddress isa = new InetSocketAddress(ip, port);
// // 创建代理
Proxy proxy = new Proxy(Proxy.Type.HTTP, isa);
HttpURLConnection uc = (HttpURLConnection) url.openConnection(proxy);
uc.setConnectTimeout(timeOut);
uc.setDoOutput(true);
uc.setUseCaches(false);
uc.setRequestMethod("GET");
uc.setRequestProperty("Host", url.getHost());
uc.setRequestProperty("Keep-Alive", "300");
uc.setRequestProperty("Accept", "*/*");
uc.setRequestProperty("Referer", Referer);
uc.setRequestProperty("Accept-Encoding", "gzip,deflate,sdch");
uc.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
uc.setRequestProperty("Cache-Control", "max-age=0");
uc.setRequestProperty("Connection", "keep-alive");
uc.setRequestProperty("User-Agent",
"Mozilla/5.0 (Windows NT 6.3; WOW64; compatible; MSIE 5.0; DigExt) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2194.2 Safari/537.36");
uc.setReadTimeout(timeOut);
if (uc.getResponseCode() == 302) {
throw new IOException("请求被跳转,断线重连");
}
// 取得字符集编码
String charSet = null;
String contentType = uc.getHeaderField("Content-Type");
if (contentType != null && !contentType.equals("")) {
String[] contentTypeArr = contentType.split(";");
for (int i = 0; i < contentTypeArr.length; i++) {
if (contentTypeArr[i].indexOf("charset=") != -1) {
String[] charSetArr = contentTypeArr[i].split("=");
charSet = charSetArr[1].trim();
break;
}
}
}
// 当字符集为GB2312时转成父集GBK
if (charSet != null) {
charSet = charSet.equalsIgnoreCase("GB2312") ? "GBK" : charSet;
} else {
charSet = "utf-8";
}
InputStream uis = null;
try {
uis = uc.getInputStream();
} catch (IOException e) {
htmlContent = "您访问的网页不存在";
}
InputStreamReader reader;
StringBuffer htmlStr = new StringBuffer();
// 检测返回的页面源码是否被压缩
try {
if (uc.getHeaderField("Content-Encoding") != null
&& uc.getHeaderField("Content-Encoding").equalsIgnoreCase("gzip"))
reader = new InputStreamReader(new GZIPInputStream(uis), charSet);
else
reader = new InputStreamReader(uis, charSet);
int readNum = 0;
while (true) {
int b = reader.read();
if (b == -1 || readNum > maxReaderLength)
break;
htmlStr.append((char) b);
readNum++;
}
reader.close();
} catch (Exception eex) {
eex.printStackTrace();
}
// 处理获取到的网页内容,将内容转换为一行字符串
htmlContent = htmlStr.toString().replace("\r\n", "").replace("\n", "").replace("\r", "")
.replaceAll("\\\\\\\\\\\\/", "/").replaceAll("\\\\\\\\\\\\\\\"", "\"").replaceAll("\\\\\"", "\"")
.replaceAll("\\\\/", "\\/");
return htmlContent;
}
测试函数
public static void main(String[] args) {
try {
// 写博客时还可以用,如果测试时不能使用,也先别急,后面会给相关资源
String ip = "61.166.153.168";
int port = 8080;
System.out.println(getPageInfo("http://www.runoob.com/python/att-dictionary-setdefault.html",
"http://www.runoob.com/python/att-dictionary-setdefault.html", ip, port));
} catch (IOException e) {
e.printStackTrace();
}
}
免费代理网站:
以下排名很分先后,因为有好用不好用的区别,而且有的网站是个人在维护,各位使用的过程中还是尽量保持相对的频率对其进行访问,以便在可以拿到相关资源的情况下,也让原博主可以省心的继续做下去(附言:这些都是在Google中搜索出来的,百度上面前几页都是广告,所以也呼吁大家多用Google)。
小幻HTTP代理(T1)
https://ip.ihuan.me/?page=4ce63706
89免费代理IP(T1)
http://www.89ip.cn/
快代理(T2)
https://www.kuaidaili.com/free/inha/
IP海
http://www.iphai.com/free/wg
云代理
http://www.ip3366.net/
国外免费代理IP-无忧代理IP
http://www.data5u.com/free/gwgn/index.shtml