Java 使用免费代理攻略

Java 爬虫代理

Posted by BY tuo on January 24, 2019

Java爬虫使用到的

原始获取网页内容类

	/**
	 * 获取页面内容
	 * 
	 * @return 返回获取的网页内容
	 * @throws IOException
	 */
	public static String getPageInfo(String urlStr, String Referer) throws IOException {

		// 超时时间
		int timeOut = 30 * 1000;
		// 网页读取最大长度
		long maxReaderLength = 1000 * 5000;

		String htmlContent = "";
		URL url = new URL(urlStr);
		HttpURLConnection uc = (HttpURLConnection) url.openConnection();

		uc.setConnectTimeout(timeOut);
		uc.setDoOutput(true);
		uc.setUseCaches(false);
		uc.setRequestMethod("GET");
		uc.setRequestProperty("Host", url.getHost());
		uc.setRequestProperty("Keep-Alive", "300");
		uc.setRequestProperty("Accept", "*/*");
		uc.setRequestProperty("Referer", Referer);
		uc.setRequestProperty("Accept-Encoding", "gzip,deflate,sdch");
		uc.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
		uc.setRequestProperty("Cache-Control", "max-age=0");
		uc.setRequestProperty("Connection", "keep-alive");
		uc.setRequestProperty("User-Agent",
				"Mozilla/5.0 (Windows NT 6.3; WOW64; compatible; MSIE 5.0; DigExt) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2194.2 Safari/537.36");		
		uc.setReadTimeout(timeOut);

		if (uc.getResponseCode() == 302) {
			throw new IOException("请求被跳转,断线重连");
		}

		// 取得字符集编码
		String charSet = null;
		String contentType = uc.getHeaderField("Content-Type");
		if (contentType != null && !contentType.equals("")) {
			String[] contentTypeArr = contentType.split(";");
			for (int i = 0; i < contentTypeArr.length; i++) {
				if (contentTypeArr[i].indexOf("charset=") != -1) {
					String[] charSetArr = contentTypeArr[i].split("=");
					charSet = charSetArr[1].trim();
					break;
				}
			}
		}
		// 当字符集为GB2312时转成父集GBK
		if (charSet != null) {
			charSet = charSet.equalsIgnoreCase("GB2312") ? "GBK" : charSet;
		} else {
			charSet = "utf-8";
		}
		InputStream uis = null;
		try {
			uis = uc.getInputStream();
		} catch (IOException e) {
			htmlContent = "您访问的网页不存在";
		}
		InputStreamReader reader;
		StringBuffer htmlStr = new StringBuffer();
		// 检测返回的页面源码是否被压缩
		try {
			if (uc.getHeaderField("Content-Encoding") != null
					&& uc.getHeaderField("Content-Encoding").equalsIgnoreCase("gzip"))
				reader = new InputStreamReader(new GZIPInputStream(uis), charSet);
			else
				reader = new InputStreamReader(uis, charSet);
			int readNum = 0;
			while (true) {
				int b = reader.read();
				if (b == -1 || readNum > maxReaderLength)
					break;
				htmlStr.append((char) b);
				readNum++;
			}
			reader.close();

		} catch (Exception eex) {
			eex.printStackTrace();
		}
		// 处理获取到的网页内容,将内容转换为一行字符串
		htmlContent = htmlStr.toString().replace("\r\n", "").replace("\n", "").replace("\r", "")
				.replaceAll("\\\\\\\\\\\\/", "/").replaceAll("\\\\\\\\\\\\\\\"", "\"").replaceAll("\\\\\"", "\"")
				.replaceAll("\\\\/", "\\/");
		return htmlContent;
	}

测试函数

	public static void main(String[] args) {
		try {
			System.out.println(getPageInfo("http://www.runoob.com/python/att-dictionary-setdefault.html",
					"http://www.runoob.com/python/att-dictionary-setdefault.html"));
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

代理访问

获取网页内容部分修改

上面是直接根据本地网络来进行请求地址,接下来让我们进入下一个环节,如果使用代理进行访问,本博客目前使用的都得免费的代理所以没有涉及到账号密码部分,感兴趣的同学可以自己私下进行查找相关文档。

/**
	 * 获取页面内容
	 * 
	 * @return 返回获取的网页内容
	 * @throws IOException
	 */
	public static String getPageInfo(String urlStr, String Referer, String ip, int port) throws IOException {

		// 超时时间
		int timeOut = 30 * 1000;
		// 网页读取最大长度
		long maxReaderLength = 1000 * 5000;

		String htmlContent = "";
		URL url = new URL(urlStr);

		// 指定代理服务器地址和端口
		InetSocketAddress isa = new InetSocketAddress(ip, port);
		// // 创建代理
		Proxy proxy = new Proxy(Proxy.Type.HTTP, isa);

		HttpURLConnection uc = (HttpURLConnection) url.openConnection(proxy);

		uc.setConnectTimeout(timeOut);
		uc.setDoOutput(true);
		uc.setUseCaches(false);
		uc.setRequestMethod("GET");
		uc.setRequestProperty("Host", url.getHost());
		uc.setRequestProperty("Keep-Alive", "300");
		uc.setRequestProperty("Accept", "*/*");
		uc.setRequestProperty("Referer", Referer);
		uc.setRequestProperty("Accept-Encoding", "gzip,deflate,sdch");
		uc.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
		uc.setRequestProperty("Cache-Control", "max-age=0");
		uc.setRequestProperty("Connection", "keep-alive");
		uc.setRequestProperty("User-Agent",
				"Mozilla/5.0 (Windows NT 6.3; WOW64; compatible; MSIE 5.0; DigExt) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2194.2 Safari/537.36");

		uc.setReadTimeout(timeOut);

		if (uc.getResponseCode() == 302) {
			throw new IOException("请求被跳转,断线重连");
		}

		// 取得字符集编码
		String charSet = null;
		String contentType = uc.getHeaderField("Content-Type");
		if (contentType != null && !contentType.equals("")) {
			String[] contentTypeArr = contentType.split(";");
			for (int i = 0; i < contentTypeArr.length; i++) {
				if (contentTypeArr[i].indexOf("charset=") != -1) {
					String[] charSetArr = contentTypeArr[i].split("=");
					charSet = charSetArr[1].trim();
					break;
				}
			}
		}
		// 当字符集为GB2312时转成父集GBK
		if (charSet != null) {
			charSet = charSet.equalsIgnoreCase("GB2312") ? "GBK" : charSet;
		} else {
			charSet = "utf-8";
		}
		InputStream uis = null;
		try {
			uis = uc.getInputStream();
		} catch (IOException e) {
			htmlContent = "您访问的网页不存在";
		}
		InputStreamReader reader;
		StringBuffer htmlStr = new StringBuffer();
		// 检测返回的页面源码是否被压缩
		try {
			if (uc.getHeaderField("Content-Encoding") != null
					&& uc.getHeaderField("Content-Encoding").equalsIgnoreCase("gzip"))
				reader = new InputStreamReader(new GZIPInputStream(uis), charSet);
			else
				reader = new InputStreamReader(uis, charSet);
			int readNum = 0;
			while (true) {
				int b = reader.read();
				if (b == -1 || readNum > maxReaderLength)
					break;
				htmlStr.append((char) b);
				readNum++;
			}
			reader.close();

		} catch (Exception eex) {
			eex.printStackTrace();
		}
		// 处理获取到的网页内容,将内容转换为一行字符串
		htmlContent = htmlStr.toString().replace("\r\n", "").replace("\n", "").replace("\r", "")
				.replaceAll("\\\\\\\\\\\\/", "/").replaceAll("\\\\\\\\\\\\\\\"", "\"").replaceAll("\\\\\"", "\"")
				.replaceAll("\\\\/", "\\/");
		return htmlContent;
	}

测试函数

	public static void main(String[] args) {
		try {
			// 写博客时还可以用,如果测试时不能使用,也先别急,后面会给相关资源
			String ip = "61.166.153.168";
			int port = 8080;
			System.out.println(getPageInfo("http://www.runoob.com/python/att-dictionary-setdefault.html",
					"http://www.runoob.com/python/att-dictionary-setdefault.html", ip, port));
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

免费代理网站:

以下排名很分先后,因为有好用不好用的区别,而且有的网站是个人在维护,各位使用的过程中还是尽量保持相对的频率对其进行访问,以便在可以拿到相关资源的情况下,也让原博主可以省心的继续做下去(附言:这些都是在Google中搜索出来的,百度上面前几页都是广告,所以也呼吁大家多用Google)。

小幻HTTP代理(T1)

https://ip.ihuan.me/?page=4ce63706

89免费代理IP(T1)

http://www.89ip.cn/

快代理(T2)

https://www.kuaidaili.com/free/inha/

IP海

http://www.iphai.com/free/wg

云代理

http://www.ip3366.net/

国外免费代理IP-无忧代理IP

http://www.data5u.com/free/gwgn/index.shtml