This page at a glance:
▪ [1] Fetching videos from Youku, Tudou, Ku6, 6.cn, etc. with Java
▪ [2] Mining and crawling video URLs
▪ [3] A simple, handy web spider/crawler
[1] Fetching videos from Youku, Tudou, Ku6, 6.cn, etc. with Java
Source: Internet. Published: 2014-02-18
Many community sites now offer this feature: a user pastes a Youku, Tudou, Ku6, 6.cn, or 56.com video page URL, and the site resolves the corresponding video (player) address and thumbnail; some sites also extract the video duration.
Sina Weibo, for example, does exactly this: once a user enters one of these video URLs, it retrieves the matching video address and thumbnail.
The code depends on a single JAR, available here:
http://jsoup.org/packages/jsoup-1.5.2.jar
Original article: http://www.juziku.com/wiki/906.htm
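Before the full utility, here is a minimal sketch of the jsoup pattern it relies on: connect to a page, parse it, and read an attribute from an element located by id. The URL and the "s_sina" element id are taken from the code below; whether the live page still exposes that element depends on the site, so treat this purely as an illustration.

    // Minimal jsoup sketch of the fetch-and-read-attribute pattern used by VideoUtil.
    // The URL and "s_sina" id come from the article's code; the page layout may have changed.
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;

    public class JsoupQuickStart {
        public static void main(String[] args) throws Exception {
            Document doc = Jsoup.connect("http://v.youku.com/v_show/id_XMjU0MjI2NzY0.html")
                    .userAgent("Mozilla")
                    .timeout(6000)
                    .get();
            Element shareLink = doc.getElementById("s_sina");
            if (shareLink != null) {
                // The href of this share link carries a "pic=" thumbnail parameter.
                System.out.println(shareLink.attr("href"));
            }
        }
    }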
Core code
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Video utility class
 * @author sunlightcs
 * 2011-4-6
 * http://hi.juziku.com/sunlightcs/
 */
public class VideoUtil {

    /**
     * Get video info for a supported site
     * @param url video page URL
     * @return the video info, or null if parsing fails
     */
    public static Video getVideoInfo(String url) {
        Video video = new Video();

        if (url.indexOf("v.youku.com") != -1) {
            try {
                video = getYouKuVideo(url);
            } catch (Exception e) {
                video = null;
            }
        } else if (url.indexOf("tudou.com") != -1) {
            try {
                video = getTudouVideo(url);
            } catch (Exception e) {
                video = null;
            }
        } else if (url.indexOf("v.ku6.com") != -1) {
            try {
                video = getKu6Video(url);
            } catch (Exception e) {
                video = null;
            }
        } else if (url.indexOf("6.cn") != -1) {
            try {
                video = get6Video(url);
            } catch (Exception e) {
                video = null;
            }
        } else if (url.indexOf("56.com") != -1) {
            try {
                video = get56Video(url);
            } catch (Exception e) {
                video = null;
            }
        }

        return video;
    }

    /**
     * Get a Youku video
     * @param url video page URL
     */
    public static Video getYouKuVideo(String url) throws Exception {
        Document doc = getURLContent(url);

        // Video thumbnail: taken from the "share to Sina" link, after its "pic=" parameter
        String pic = getElementAttrById(doc, "s_sina", "href");
        int local = pic.indexOf("pic=");
        pic = pic.substring(local + 4);

        // Video (player) address
        String flash = getElementAttrById(doc, "link2", "value");

        // Video duration: the fifth "|"-separated field of the download link
        String time = getElementAttrById(doc, "download", "href");
        String[] arrays = time.split("\\|");
        time = arrays[4];

        Video video = new Video();
        video.setPic(pic);
        video.setFlash(flash);
        video.setTime(time);

        return video;
    }

    /**
     * Get a Tudou video
     * @param url video page URL
     */
    public static Video getTudouVideo(String url) throws Exception {
        Document doc = getURLContent(url);
        String content = doc.html();
        int beginLocal = content.indexOf("<script>document.domain");
        int endLocal = content.indexOf("</script>");
        content = content.substring(beginLocal, endLocal);

        // Video (player) address, built from the "iid_code" script variable
        String flash = getScriptVarByName("iid_code", content);
        flash = "http://www.tudou.com/v/" + flash + "/v.swf";

        // Video thumbnail
        String pic = getScriptVarByName("thumbnail", content);

        // Video duration
        String time = getScriptVarByName("time", content);

        Video video = new Video();
        video.setPic(pic);
        video.setFlash(flash);
        video.setTime(time);

        return video;
    }

    /**
     * Get a Ku6 video
     * @param url video page URL
     */
    public static Video getKu6Video(String url) throws Exception {
        Document doc = getURLContent(url);

        // Video (player) address
        Element flashEt = doc.getElementById("outSideSwfCode");
        String flash = flashEt.attr("value");

        // Video thumbnail
        Element picEt = doc.getElementById("plVideosList");
        String time = null;
        String pic = null;
        if (picEt != null) {
            Elements pics = picEt.getElementsByTag("img");
            pic = pics.get(0).attr("src");

            // Video duration
            Element timeEt = picEt.select("span.review>cite").first();
            time = timeEt.text();
        } else {
            pic = doc.getElementsByClass("s_pic").first().text();
        }

        Video video = new Video();
        video.setPic(pic);
        video.setFlash(flash);
        video.setTime(time);

        return video;
    }

    /**
     * Get a 6.cn (6间房) video
     * @param url video page URL
     */
    public static Video get6Video(String url) throws Exception {
        Document doc = getURLContent(url);

        // Video thumbnail
        Element picEt = doc.getElementsByClass("summary").first();
        String pic = picEt.getElementsByTag("img").first().attr("src");

        // Video duration
        String time = getVideoTime(doc, url, "watchUserVideo");
        if (time == null) {
            time = getVideoTime(doc, url, "watchRelVideo");
        }

        // Video (player) address, taken from the embed code in the share box
        Element flashEt = doc.getElementById("video-share-code");
        doc = Jsoup.parse(flashEt.attr("value"));
        String flash = doc.select("embed").attr("src");

        Video video = new Video();
        video.setPic(pic);
        video.setFlash(flash);
        video.setTime(time);

        return video;
    }

    /**
     * Get a 56.com video
     * @param url video page URL
     */
    public static Video get56Video(String url) throws Exception {
        Document doc = getURLContent(url);
        String content = doc.html();

        // Video thumbnail, extracted from the "img" field of an inline JSON blob
        int begin = content.indexOf("\"img\":\"");
        content = content.substring(begin + 7, begin + 200);
        int end = content.indexOf("\"};");
        String pic = content.substring(0, end).trim();
        pic = pic.replaceAll("\\\\", "");

        // Video (player) address, derived from the page URL
        String flash = "http://player.56.com" + url.substring(url.lastIndexOf("/"), url.lastIndexOf(".html")) + ".swf";

        Video video = new Video();
        video.setPic(pic);
        video.setFlash(flash);

        return video;
    }

    /**
     * Get the duration of a 6.cn video
     */
    private static String getVideoTime(Document doc, String url, String id) {
        String time = null;

        Element timeEt = doc.getElementById(id);
        Elements links = timeEt.select("dt > a");

        for (Element link : links) {
            String linkHref = link.attr("href");
            if (linkHref.equalsIgnoreCase(url)) {
                time = link.parent().getElementsByTag("em").first().text();
                break;
            }
        }
        return time;
    }

    /**
     * Get the value of a script variable by name
     * @param name variable name
     * @return the extracted value
     */
    private static String getScriptVarByName(String name, String content) {
        String script = content;

        int begin = script.indexOf(name);

        script = script.substring(begin + name.length() + 2);

        int end = script.indexOf(",");

        script = script.substring(0, end);

        String result = script.replaceAll("'", "");
        result = result.trim();

        return result;
    }

    /**
     * Get an attribute value from the element with the given HTML id
     * @param id HTML element id
     * @param attrName attribute name
     * @return the attribute value
     */
    private static String getElementAttrById(Document doc, String id, String attrName) throws Exception {
        Element et = doc.getElementById(id);
        String attrValue = et.attr(attrName);

        return attrValue;
    }

    /**
     * Fetch and parse a web page
     */
    private static Document getURLContent(String url) throws Exception {
        // The data/cookie values below mirror the standard jsoup connect example
        Document doc = Jsoup.connect(url)
                .data("query", "Java")
                .userAgent("Mozilla")
                .cookie("auth", "token")
                .timeout(6000)
                .post();
        return doc;
    }

    public static void main(String[] args) {
        //String url = "http://v.youku.com/v_show/id_XMjU0MjI2NzY0.html";
        //String url = "http://www.tudou.com/programs/view/pVploWOtCQM/";
        //String url = "http://v.ku6.com/special/show_4024167/9t7p64bisV2A31Hz.html";
        //String url = "http://v.ku6.com/show/BpP5LeyVwvikbT1F.html";
        //String url = "http://6.cn/watch/14757577.html";
        String url = "http://www.56.com/u64/v_NTkzMDEzMTc.html";
        Video video = getVideoInfo(url);
        System.out.println("Thumbnail: " + video.getPic());
        System.out.println("Video address: " + video.getFlash());
        System.out.println("Duration: " + video.getTime());
    }
}
[Code] Video wrapper class

/**
 * Video wrapper
 * @author sunlightcs
 * 2011-4-6
 * http://hi.juziku.com/sunlightcs/
 */
public class Video {
    private String flash;
    private String pic;
    private String time;
    public String getFlash() {
        return flash;
    }
    public void setFlash(String flash) {
        this.flash = flash;
    }
    public String getPic() {
        return pic;
    }
    public void setPic(String pic) {
        this.pic = pic;
    }
    public String getTime() {
        return time;
    }
    public void setTime(String time) {
        this.time = time;
    }
}
[2] Mining and crawling video URLs
Source: Internet. Published: 2014-02-18
// Get the video title
public String getVideoTitle(String beginTitleStr, int beginTextNum, String endTitleStr) {
    int beginTitleNum = videoStr.indexOf(beginTitleStr, beginTextNum) + beginTitleStr.length();
    int endTitleNum = videoStr.indexOf(endTitleStr, beginTitleNum);
    String videoTitle = new ToolsSubString().mySubString(videoStr, beginTitleNum, endTitleNum);
    return videoTitle;
}

// Get the video ID from its URL
public String getVideoId(String beginIdStr, int beginTextNum, String endIdStr) {
    int beginIdNum = videoStr.indexOf(beginIdStr, beginTextNum) + beginIdStr.length();
    int endIdNum = videoStr.indexOf(endIdStr, beginIdNum);
    String videoId = new ToolsSubString().mySubString(videoStr, beginIdNum, endIdNum);
    return videoId;
}

// Crawl video info from Tudou search results
public void catchTudouVideo(int neadCatchNum, String beginSearchVideoNumStr) {
    // Markers used to locate the content to extract
    String endSearchVideoNumStr = "</em>";
    String beginTextStr = "<a inner\" target=\"new";
    String beginIdStr = "href=\"http://www.tudou.com/programs/view/";
    String endIdStr = "/\"";
    String beginTitleStr = "title=\"";
    String endTitleStr = "\"";
    int beginTextNum = 0;
    // Get the number of videos found, then loop to collect each video's info
    int searchVideoNum = getVideoNum(neadCatchNum, beginSearchVideoNumStr, endSearchVideoNumStr);
    for (int i = 0; i < searchVideoNum; i++) {
        // Locate the next result
        beginTextNum = videoStr.indexOf(beginTextStr, beginTextNum) + beginTextStr.length();
        // Extract the video info
        String videoTitle = getVideoTitle(beginTitleStr, beginTextNum, endTitleStr);
        String videoId = getVideoId(beginIdStr, beginTextNum, endIdStr);
        String videoUrl = "http://www.tudou.com/v/" + videoId + "/v.swf";
        // Wrap the info in a video object and add it to the list, ready to be saved to the database
        VideoEntity video = new VideoEntity(videoTitle, "土豆网", videoUrl);
        videoList.add(video);
    }
}
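The fragments above reference a few members that the excerpt does not show: a videoStr field holding the downloaded search-result HTML, a videoList collection, a getVideoNum helper, and the ToolsSubString and VideoEntity classes. Purely as an assumption, so the snippets have something to compile against, a minimal sketch of those pieces might look like this:

    // Hypothetical supporting pieces -- assumed for illustration, not from the original article.
    import java.util.ArrayList;
    import java.util.List;

    class ToolsSubString {
        // Assumed: a thin, bounds-checked wrapper around String.substring.
        public String mySubString(String source, int begin, int end) {
            if (source == null || begin < 0 || end > source.length() || begin > end) {
                return "";
            }
            return source.substring(begin, end);
        }
    }

    class VideoEntity {
        // Assumed: a simple value object for a crawled video (title, source site, player URL).
        private final String title;
        private final String site;
        private final String url;
        public VideoEntity(String title, String site, String url) {
            this.title = title;
            this.site = site;
            this.url = url;
        }
        public String getTitle() { return title; }
        public String getSite() { return site; }
        public String getUrl() { return url; }
    }

    class TudouCatcher {
        // videoStr would hold the downloaded search-result HTML; videoList collects the results.
        String videoStr = "";
        List<VideoEntity> videoList = new ArrayList<VideoEntity>();

        // Assumed: reads the hit count between the two markers and caps it at the requested number.
        int getVideoNum(int neadCatchNum, String beginStr, String endStr) {
            int begin = videoStr.indexOf(beginStr) + beginStr.length();
            int end = videoStr.indexOf(endStr, begin);
            int total = Integer.parseInt(videoStr.substring(begin, end).trim());
            return Math.min(total, neadCatchNum);
        }

        // The getVideoTitle/getVideoId/catchTudouVideo methods shown above would also live here.
    }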
[3] A simple, handy web spider/crawler
Source: Internet. Published: 2014-02-18
package spider;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SearchCrawler implements Runnable {
    /*
     * disallowListCache caches the URLs that robots are not allowed to crawl.
     * The robots exclusion protocol places a robots.txt file in the site's root
     * directory listing the paths that crawlers must skip, and a crawler should
     * skip those areas. Example robots.txt:
     *   # robots.txt for http://somehost.com/
     *   User-agent: *
     *   Disallow: /cgi-bin/
     *   Disallow: /registration # Disallow robots on registration page
     *   Disallow: /login
     */
    private HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>();
    ArrayList<String> errorList = new ArrayList<String>(); // error messages
    ArrayList<String> result = new ArrayList<String>();    // matching URLs found
    String startUrl;               // starting point of the crawl
    int maxUrl;                    // maximum number of URLs to process
    String searchString;           // the (English) string to search for
    boolean caseSensitive = false; // case-sensitive match?
    boolean limitHost = false;     // restrict the crawl to the start host?

    public SearchCrawler(String startUrl, int maxUrl, String searchString) {
        this.startUrl = startUrl;
        this.maxUrl = maxUrl;
        this.searchString = searchString;
    }

    public ArrayList<String> getResult() {
        return result;
    }

    public void run() { // entry point of the search thread
        crawl(startUrl, maxUrl, searchString, limitHost, caseSensitive);
    }

    // Validate the URL format
    private URL verifyUrl(String url) {
        // Only handle HTTP URLs.
        if (!url.toLowerCase().startsWith("http://"))
            return null;
        URL verifiedUrl = null;
        try {
            verifiedUrl = new URL(url);
        } catch (Exception e) {
            return null;
        }
        return verifiedUrl;
    }

    // Check whether robots are allowed to access the given URL.
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase(); // host of the given URL
        // System.out.println("host=" + host);

        // Look up the cached disallow list for this host
        ArrayList<String> disallowList = disallowListCache.get(host);

        // If it is not cached yet, download and cache it.
        if (disallowList == null) {
            disallowList = new ArrayList<String>();
            try {
                URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(robotsFileUrl.openStream()));
                // Read the robots file and build the list of disallowed paths.
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.indexOf("Disallow:") == 0) { // line starts with "Disallow:"
                        String disallowPath = line.substring("Disallow:".length()); // the disallowed path
                        // Strip a trailing comment, if any.
                        int commentIndex = disallowPath.indexOf("#");
                        if (commentIndex != -1) {
                            disallowPath = disallowPath.substring(0, commentIndex);
                        }
                        disallowPath = disallowPath.trim();
                        disallowList.add(disallowPath);
                    }
                }
                // Cache the disallowed paths for this host.
                disallowListCache.put(host, disallowList);
            } catch (Exception e) {
                return true; // no robots.txt at the site root, so crawling is allowed
            }
        }

        String file = urlToCheck.getFile();
        // System.out.println("getFile()=" + file);
        for (int i = 0; i < disallowList.size(); i++) {
            String disallow = disallowList.get(i);
            if (file.startsWith(disallow)) {
                return false;
            }
        }
        return true;
    }

    private String downloadPage(URL pageUrl) {
        try {
            // Open connection to URL for reading.
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    pageUrl.openStream()));
            // Read page into buffer.
            String line;
            StringBuffer pageBuffer = new StringBuffer();
            while ((line = reader.readLine()) != null) {
                pageBuffer.append(line);
            }
            return pageBuffer.toString();
        } catch (Exception e) {
        }
        return null;
    }

    // Remove "www" from a URL
    private String removeWwwFromUrl(String url) {
        int index = url.indexOf("://www.");
        if (index != -1) {
            return url.substring(0, index + 3) + url.substring(index + 7);
        }
        return (url);
    }

    // Parse a page and collect its links
    private ArrayList<String> retrieveLinks(URL pageUrl, String pageContents,
            HashSet<String> crawledList, boolean limitHost) {
        // Compile the link-matching pattern.
        Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",
                Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(pageContents);
        ArrayList<String> linkList = new ArrayList<String>();
        while (m.find()) {
            String link = m.group(1).trim();
            if (link.length() < 1) {
                continue;
            }
            // Skip in-page anchors.
            if (link.charAt(0) == '#') {
                continue;
            }
            if (link.indexOf("mailto:") != -1) {
                continue;
            }
            if (link.toLowerCase().indexOf("javascript") != -1) {
                continue;
            }
            if (link.indexOf("://") == -1) {
                if (link.charAt(0) == '/') { // site-absolute path
                    link = "http://" + pageUrl.getHost() + ":"
                            + pageUrl.getPort() + link;
                } else {
                    String file = pageUrl.getFile();
                    if (file.indexOf('/') == -1) { // relative path from the host root
                        link = "http://" + pageUrl.getHost() + ":"
                                + pageUrl.getPort() + "/" + link;
                    } else { // relative path from the current directory
                        String path = file.substring(0, file.lastIndexOf('/') + 1);
                        link = "http://" + pageUrl.getHost() + ":"
                                + pageUrl.getPort() + path + link;
                    }
                }
            }
            int index = link.indexOf('#');
            if (index != -1) {
                link = link.substring(0, index);
            }
            link = removeWwwFromUrl(link);
            URL verifiedLink = verifyUrl(link);
            if (verifiedLink == null) {
                continue;
            }
            /* If the crawl is limited to one host, drop URLs on other hosts */
            if (limitHost
                    && !pageUrl.getHost().toLowerCase().equals(
                            verifiedLink.getHost().toLowerCase())) {
                continue;
            }
            // Skip links that have already been crawled.
            if (crawledList.contains(link)) {
                continue;
            }
            linkList.add(link);
        }
        return (linkList);
    }

    // Check whether the downloaded page contains the search string
    private boolean searchStringMatches(String pageContents,
            String searchString, boolean caseSensitive) {
        String searchContents = pageContents;
        if (!caseSensitive) { // case-insensitive match
            searchContents = pageContents.toLowerCase();
        }
        Pattern p = Pattern.compile("[\\s]+");
        String[] terms = p.split(searchString);
        for (int i = 0; i < terms.length; i++) {
            if (caseSensitive) {
                if (searchContents.indexOf(terms[i]) == -1) {
                    return false;
                }
            } else {
                if (searchContents.indexOf(terms[i].toLowerCase()) == -1) {
                    return false;
                }
            }
        }
        return true;
    }

    // Perform the actual crawl
    public ArrayList<String> crawl(String startUrl, int maxUrls,
            String searchString, boolean limithost, boolean caseSensitive) {
        HashSet<String> crawledList = new HashSet<String>();
        LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();
        if (maxUrls < 1) {
            errorList.add("Invalid Max URLs value.");
            System.out.println("Invalid Max URLs value.");
        }
        if (searchString.length() < 1) {
            errorList.add("Missing Search String.");
            System.out.println("Missing search String");
        }
        if (errorList.size() > 0) {
            System.out.println("err!!!");
            return errorList;
        }
        // Remove "www" from the start URL
        startUrl = removeWwwFromUrl(startUrl);
        toCrawlList.add(startUrl);
        while (toCrawlList.size() > 0) {
            if (maxUrls != -1) {
                if (crawledList.size() == maxUrls) {
                    break;
                }
            }
            // Get URL at bottom of the list.
            String url = toCrawlList.iterator().next();
            // Remove URL from the to crawl list.
            toCrawlList.remove(url);
            // Convert string url to URL object.
            URL verifiedUrl = verifyUrl(url);
            // Skip malformed URLs (verifyUrl returns null for them).
            if (verifiedUrl == null) {
                continue;
            }
            // Skip URL if robots are not allowed to access it.
            if (!isRobotAllowed(verifiedUrl)) {
                continue;
            }
            // Add the processed URL to crawledList
            crawledList.add(url);
            String pageContents = downloadPage(verifiedUrl);
            if (pageContents != null && pageContents.length() > 0) {
                // Collect the valid links on the page
                ArrayList<String> links = retrieveLinks(verifiedUrl,
                        pageContents, crawledList, limitHost);
                toCrawlList.addAll(links);
                if (searchStringMatches(pageContents, searchString, caseSensitive)) {
                    result.add(url);
                    System.out.println(url);
                }
            }
        }
        return result;
    }

    // Main entry point
    public static void main(String[] args) {
        SearchCrawler crawler = new SearchCrawler(
                "http://www.blogjava.net/Jack2007/", 20, "jack");
        Thread search = new Thread(crawler);
        System.out.println("Start searching...");
        System.out.println("result:");
        search.start();
        try {
            search.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
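The main() above starts the crawler thread and waits for it, but never reads back the pages it collected. A small usage sketch, assuming it is placed in the same spider package, could retrieve them through getResult() once the thread has finished:

    // Usage sketch (assumption: lives in the same "spider" package as SearchCrawler).
    package spider;

    public class CrawlerDemo {
        public static void main(String[] args) throws InterruptedException {
            // Same start URL, URL limit, and search term as the main() above.
            SearchCrawler crawler = new SearchCrawler(
                    "http://www.blogjava.net/Jack2007/", 20, "jack");
            Thread search = new Thread(crawler);
            search.start();
            search.join(); // wait for the crawl to finish
            // Print every page on which the search string was found.
            for (String url : crawler.getResult()) {
                System.out.println("match: " + url);
            }
        }
    }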