@Grapes([
@Grab(group='org.ccil.cowan.tagsoup', module='tagsoup', version='1.2')
,@Grab(group='net.sourceforge.nekohtml', module='nekohtml', version='1.9.14')
//,@Grab(group='org.htmlcleaner', module='htmlcleaner', version='2.2')
]
)
import org.ccil.cowan.tagsoup.* //TagSoup
import org.cyberneko.html.parsers.SAXParser //NekoHTML
import org.htmlcleaner.* //not in maven -- http://htmlcleaner.sourceforge.net/, http://dist.wso2.org/maven2/org/htmlcleaner/htmlcleaner/
// Page that both parser runs below fetch and scan for links.
def url = new URL("http://lifehacker.com")
def stars = "*" * 15

// --- Run 1: stream the page through an XmlSlurper backed by the TagSoup Parser ---
println stars + 'TAG SOUP' + stars
slurper = new XmlSlurper(new Parser())
url.withReader { pageReader ->
    html = slurper.parse(pageReader)
    def tagSoupLinks = findLinks(html)
    println tagSoupLinks.join('\n')
}

// --- Run 2: download the page as text and parse it with the NekoHTML SAXParser ---
println stars + 'NekoHTML' + stars
def parser = new SAXParser()
parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true)
def nekoSlurper = new XmlSlurper(parser)
html = nekoSlurper.parseText(url.text)
def nekoLinks = findLinks(html)
println nekoLinks.join('\n')
/*
println "*" * 15 + 'HTMLCleaner' + "*" * 15
def cleaner = new HtmlCleaner()
def node = cleaner.clean(url)
def props = cleaner.getProperties()
def serializer = new SimpleXmlSerializer(props)
def xml = serializer.getXmlAsString(node)
def page = new XmlSlurper(false,false).parseText(xml)
println findLinks(page).join('\n')
*/
/// UTILS
/**
 * Collects the distinct link targets found beneath the element whose
 * {@code id} attribute is "splashPosts".
 *
 * Anchor elements are matched by tag name case-insensitively, anchors whose
 * {@code href} contains '/tag/' are skipped, and the first occurrence of the
 * '?skyline=true&s=i' suffix is stripped from each remaining {@code href}.
 * The number of matching anchors (counted before de-duplication) is printed
 * as "found N".
 *
 * @param root parsed document to search (the callers pass the result of an
 *             XmlSlurper parse); may be null
 * @return the distinct href values in ascending order; empty when root is
 *         null or no "splashPosts" element exists
 */
Set findLinks(def root) {
    // Locate the container element; stays null when root is null or nothing carries the id.
    def posts = root?.depthFirst()?.find { it.@id.text() == "splashPosts" }

    // Every anchor below the container that is not a tag link.
    def anchors = posts?.depthFirst()?.findAll { node ->
        node.name().toLowerCase() == "a" && !node.@href.text().contains('/tag/')
    } ?: []

    println "found ${anchors.size()}"

    // A TreeSet makes "sorted and distinct" explicit instead of returning a sorted
    // List and relying on implicit List-to-Set coercion at the return statement.
    return new TreeSet(anchors.collect { it.@href.text() - '?skyline=true&s=i' })
}
http://groovyconsole.appspot.com/script/448003
BONUS: CSS Selectors with JSoup
// Fetch a Bing results page with JSoup and print each hit selected by a CSS selector.
@Grapes( @Grab(group='org.jsoup', module='jsoup', version='1.6.1') )
def doc = org.jsoup.Jsoup.connect("http://www.bing.com/search?q=web+scraping").get()
println 'start'
// One output line per anchor inside an h3 under #results: link text, then its href.
for (anchor in doc.select("#results h3 a")) {
    println "-->${anchor.text()} == ${anchor.attr('href')}"
}
println 'done'
BONUS: XPath Selectors with TagSoup
@Grapes( @Grab('org.ccil.cowan.tagsoup:tagsoup:1.2') ) import org.ccil.cowan.tagsoup.Parser;
import org.xml.sax.*;
import javax.xml.transform.*;
import javax.xml.transform.dom.*;
import javax.xml.transform.sax.*;
import javax.xml.xpath.*
// Fetch a Bing results page, turn TagSoup's SAX events into a DOM tree, and
// print every node matched by an XPath expression.
def urlString = "http://www.bing.com/search?q=web+scraping"
URL url = new URL(urlString)
XMLReader reader = new Parser()

// Transform SAX to DOM: switch off namespace reporting on the TagSoup reader,
// then run an identity transform from the SAX source into a DOMResult.
reader.setFeature(Parser.namespacesFeature, false)
reader.setFeature(Parser.namespacePrefixesFeature, false)
Transformer transformer = TransformerFactory.newInstance().newTransformer()
DOMResult result = new DOMResult()

// withInputStream closes the connection's stream once the transform finishes
// (or throws); the bare url.openStream() it replaces was never closed.
url.withInputStream { stream ->
    transformer.transform(new SAXSource(reader, new InputSource(stream)), result)
}

def xpath = XPathFactory.newInstance().newXPath()
// XPath counterpart of the CSS selector: $('#results h3 a')
def results = xpath.evaluate( '//*[@id=\'results\']//h3/a', result.getNode(), XPathConstants.NODESET )
results.each { println it }
// Separate statement: the original had this fused onto the end of the line above.
println 'done'
No comments:
Post a Comment