使用webmagic实现爬虫程序示例分享
本文导语: 代码如下:package com.letv.cloud.spider; import java.util.HashSet;import java.util.List; import us.codecraft.webmagic.Page;import us.codecraft.webmagic.Site;import us.codecraft.webmagic.Spider;import us.codecraft.webmagic.processor.PageProcessor; public class MoviePaperPageProcessor implement...
package com.letv.cloud.spider;
import java.util.HashSet;
import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
public class MoviePaperPageProcessor implements PageProcessor {
// Crawl settings handed out by getSite(): retry count 3 and sleep time 1000
// (presumably milliseconds - confirm against the WebMagic Site API).
// NOTE(review): despite its name this field holds a Site, not a Page; the
// process(Page page) parameter shadows it inside that method.
private Site page = Site.me().setRetryTimes(3).setSleepTime(1000);
/**
 * Returns the crawl settings used by this processor.
 *
 * @return the {@code Site} instance stored in the {@code page} field
 */
@Override
public Site getSite() {
    return page;
}
public void process(Page page) {
List links = page.getHtml().links().regex(
"http://posters.aa.com/poster/\d+").all();
links = removeDuplicate(links);
page.addTargetRequests(links);
page.putField("title", page.getHtml().xpath(
"//div[@id='imdbleftsecc']/center/h1/text()").toString());
page.putField("imgurl", page.getHtml().xpath(
"//div[@id='imdbleftsecc']/center/img/@src").toString());
}
public static void main(String[] args) {
for (int i = 1; i