Using a Web Crawler to Download Web Novels
(Java Programming)
When browsing the web we often run into a site whose content we like but cannot download, and copying and pasting it by hand costs a lot of time and effort for little result. Here is a simple example of crawling text from a web page.
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.Date;

public class GetHtml {

    public static void main(String[] args) throws IOException {
        // Starting page; the crawler follows the "previous" and "next" links from here.
        String url = "http://www.avkez.info/article-show-id-157253.html";
        takeUp(url);
        takeDown(url);
    }

    // Follow the "上一篇" (previous article) links recursively until none is left.
    static void takeUp(String Tem_url) throws IOException {
        String tem = take(Tem_url, "上一篇");
        if (tem != null) {
            takeUp(tem);
        }
        System.out.println("Finished downloading upward");
    }

    // Follow the "下一篇" (next article) links recursively until none is left.
    static void takeDown(String Tem_url) throws IOException {
        String tem = take(Tem_url, "下一篇");
        if (tem != null) {
            takeDown(tem);
        }
        System.out.println("Finished downloading downward");
    }

    // Download one page: extract the chapter title, save the chapter text to a file,
    // and return the URL of the neighbouring page marked by `tem`
    // ("上一篇" or "下一篇"), or null if there is none.
    static String take(String Tem_url, String tem) {
        String temp1 = null;
        try {
            URL url = new URL(Tem_url);              // resource object
            URLConnection uc = url.openConnection(); // connection object
            uc.connect();                            // open the connection
            String temp;
            BufferedReader in = new BufferedReader(new InputStreamReader(
                    uc.getInputStream(), "UTF-8"));
            File f = null;
            while ((temp = in.readLine()) != null) {
                // The <h2> line after the "title" marker holds the chapter title.
                if (temp.contains("\"title\"")) {
                    while (!(temp = in.readLine()).contains("h2")) {
                    }
                    // Strip HTML tags and characters that are illegal in file names.
                    temp = temp.replaceAll("<[^>]+>", "")
                            .replace("/", "").replace("?", "").trim();
                    System.out.println(Tem_url);
                    System.out.println(temp);
                    f = new File("D:/宅男客栈", temp + ".txt");
                    f.getParentFile().mkdirs();
                }
                // The chapter body sits between the "内容页(1)" and "内容页(2)" markers.
                if (temp.contains("内容页(1)")) {
                    BufferedWriter bwrite = new BufferedWriter(
                            new OutputStreamWriter(new FileOutputStream(f), "utf-8"));
                    while (!(temp = in.readLine()).contains("内容页(1)")) {
                    }
                    while (!(temp = in.readLine()).contains("内容页(2)")) {
                        // Strip residual HTML tags and non-breaking spaces before writing.
                        temp = temp.replaceAll("<[^>]+>", "")
                                .replace("&nbsp;", "").trim();
                        bwrite.write(temp);
                        bwrite.newLine();
                    }
                    bwrite.flush();
                    bwrite.close();
                }
                // Pick up the previous/next article link, unless the page reports
                // "暂无数据" (no data), in which case we stop in this direction.
                if (temp.contains(tem)) {
                    if (temp.contains("暂无数据")) {
                        break;
                    }
                    String[] str = temp.split("\"");
                    temp1 = "http://www.avkez.info" + str[1];
                }
            }
            in.close();
        } catch (IOException e) {
            e.printStackTrace();
            // Append the time, message and URL of the failure to an error log.
            File temf = new File("D:/宅男客栈/错误日志");
            if (!temf.exists()) {
                temf.mkdirs();
            }
            File fmis = new File("D:/宅男客栈/错误日志", "错误日志.txt");
            try {
                BufferedWriter bwrite = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream(fmis, true), "utf-8"));
                bwrite.write(new Date().toString());
                bwrite.newLine();
                bwrite.write("    " + e.getMessage());
                bwrite.newLine();
                bwrite.write("    " + Tem_url);
                bwrite.newLine();
                bwrite.flush();
                bwrite.close();
            } catch (IOException e1) {
                e1.printStackTrace();
            }
        }
        return temp1;
    }
}
Of course, this is only a simple example, aimed at readers who have not been programming for long. If you are interested, you can improve the program, for example by using multiple threads to speed up the downloads, as sketched below.
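As a rough sketch of the multithreading idea (everything here is an assumption for illustration, not part of the original program: the class name ParallelDownload, the pool size of 4, and the premise that the article URLs have already been collected by following the links one by one), each page download can be handed to a thread pool so several pages are fetched concurrently:

import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class ParallelDownload {
    // `urls` is assumed to be the list of article URLs gathered beforehand;
    // GetHtml.take(url, ...) does the actual download and save, as in the program above.
    static void downloadAll(List<String> urls) throws InterruptedException {
        ExecutorService pool = Executors.newFixedThreadPool(4); // 4 worker threads (assumed size)
        for (String u : urls) {
            pool.submit(() -> GetHtml.take(u, "下一篇"));       // each page is fetched on its own thread
        }
        pool.shutdown();                                        // stop accepting new tasks
        pool.awaitTermination(1, TimeUnit.HOURS);               // wait for all downloads to finish
    }
}

Note that the link chain itself still has to be discovered sequentially, since each page only reveals its own neighbours; what a thread pool parallelizes is the downloading and writing of pages that are already known.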
As for the content being crawled, it was "刘备". Fortunately that page is already dead, but the approach itself still holds. Readers with some HTML background will find the code even easier to follow.
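For readers comfortable with HTML, the same extraction can also be done with an HTML parser instead of line-by-line string matching. Below is a minimal sketch assuming the third-party Jsoup library is on the classpath; the selectors "h2" and "a:contains(下一篇)" are guesses about the page structure (the original site is gone), and there are no null checks:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class JsoupSketch {
    public static void main(String[] args) throws Exception {
        Document doc = Jsoup.connect("http://www.avkez.info/article-show-id-157253.html").get();
        // Assumed selectors; adjust them to the real page markup.
        String title = doc.select("h2").first().text();
        String next = doc.select("a:contains(下一篇)").attr("abs:href");
        System.out.println(title + " -> " + next);
    }
}

Working on the parsed DOM avoids the fragile "skip lines until a marker appears" loops in the program above.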