// Scrapes the lifehacker.com front page twice -- once through TagSoup and once
// through NekoHTML -- and prints the post links found by findLinks() each time.
@Grapes([
    @Grab(group='org.ccil.cowan.tagsoup', module='tagsoup', version='1.2'),
    @Grab(group='net.sourceforge.nekohtml', module='nekohtml', version='1.9.14')
    //,@Grab(group='org.htmlcleaner', module='htmlcleaner', version='2.2')
])
import org.ccil.cowan.tagsoup.*             // TagSoup
import org.cyberneko.html.parsers.SAXParser // NekoHTML
import org.htmlcleaner.*                    // not in maven -- http://htmlcleaner.sourceforge.net/, http://dist.wso2.org/maven2/org/htmlcleaner/htmlcleaner/

def url = new URL("http://lifehacker.com")

// --- TagSoup: stream the page through a reader into XmlSlurper ---
println "*" * 15 + 'TAG SOUP' + "*" * 15
slurper = new XmlSlurper(new Parser())
url.withReader { pageReader ->
    html = slurper.parse(pageReader)
    println findLinks(html).join('\n')
}

// --- NekoHTML: fetch the page as text and parse it with the Neko SAX parser ---
println "*" * 15 + 'NekoHTML' + "*" * 15
def nekoParser = new SAXParser()
nekoParser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true)
html = new XmlSlurper(nekoParser).parseText(url.text)
println findLinks(html).join('\n')

/*
println "*" * 15 + 'HTMLCleaner' + "*" * 15
def cleaner = new HtmlCleaner()
def node = cleaner.clean(url)
def props = cleaner.getProperties()
def serializer = new SimpleXmlSerializer(props)
def xml = serializer.getXmlAsString(node)
def page = new XmlSlurper(false,false).parseText(xml)
println findLinks(page).join('\n')
*/

/// UTILS

/**
 * Collects the hrefs of the anchors located under the element whose id is
 * "splashPosts", skipping any href that contains '/tag/' and removing the
 * '?skyline=true&s=i' suffix from each one.
 *
 * Prints "found N", where N is the number of anchors visited (counted before
 * duplicates are collapsed by the Set conversion).
 *
 * @param root parsed document to search (a GPath result from XmlSlurper)
 * @return the distinct hrefs, sorted
 */
Set findLinks(def root) {
    int visited = 0

    // First element, in depth-first order, whose id attribute is "splashPosts".
    def container = root?.depthFirst()?.find { candidate ->
        candidate.@id.text() == "splashPosts"
    }

    // Element names are compared lower-cased so the match is case-insensitive.
    def anchors = container?.depthFirst()?.findAll { candidate ->
        candidate.name().toLowerCase() == "a" && !candidate.@href.text().contains('/tag/')
    }

    Set links = anchors.collect([]) { anchor ->
        visited++
        anchor.@href.text() - '?skyline=true&s=i'
    } as Set

    println "found $visited"
    return links.sort()
}
http://groovyconsole.appspot.com/script/448003
BONUS: CSS Selectors with JSoup
// Fetches a Bing results page with JSoup and prints the text and href of every
// anchor matched by the CSS selector "#results h3 a".
@Grab('org.jsoup:jsoup:1.6.1')
def doc = org.jsoup.Jsoup.connect("http://www.bing.com/search?q=web+scraping").get()

println 'start'
def resultLinks = doc.select("#results h3 a")
for (link in resultLinks) {
    println '-->' + link.text() + ' == ' + link.attr('href')
}
println 'done'
BONUS: XPath Selectors with TagSoup
// Fetches a Bing results page, converts TagSoup's SAX events into a DOM tree,
// and prints the text and href of every anchor matched by an XPath expression.
@Grapes( @Grab('org.ccil.cowan.tagsoup:tagsoup:1.2') )
import org.ccil.cowan.tagsoup.Parser
import org.xml.sax.*
import javax.xml.transform.*
import javax.xml.transform.dom.*
import javax.xml.transform.sax.*
import javax.xml.xpath.*

def urlString = "http://www.bing.com/search?q=web+scraping"
URL url = new URL(urlString)

XMLReader reader = new Parser()
// Namespace handling is switched off so the XPath below can use plain element names.
reader.setFeature(Parser.namespacesFeature, false)
reader.setFeature(Parser.namespacePrefixesFeature, false)

// Transform SAX to DOM: an identity transformer copies the parsed events into a DOMResult.
Transformer transformer = TransformerFactory.newInstance().newTransformer()
DOMResult result = new DOMResult()

// withInputStream closes the network stream once parsing finishes (or fails);
// the previous bare url.openStream() was never closed.
url.withInputStream { stream ->
    transformer.transform(new SAXSource(reader, new InputSource(stream)), result)
}

def xpath = XPathFactory.newInstance().newXPath()

// CSS selector equivalent: $('#results h3 a')
def results = xpath.evaluate('//*[@id=\'results\']//h3/a', result.getNode(), XPathConstants.NODESET)

// Print link text and target rather than the DOM node's default toString(),
// using the same format as the JSoup example.
results.each { node ->
    println '-->' + node.textContent + ' == ' + node.getAttribute('href')
}
println 'done'
No comments:
Post a Comment