tidyXML: Downloads a web page’s HTML source and runs it through HTML Tidy, converting it to XML. Works great for doing quick and easy web scraping: just load the resulting XML into a module such as libxmljs and use XPath to select and retrieve the desired data. [Example]
xml2js: Neat utility that converts an XML string (or an already-parsed libxmljs element) to a JavaScript object using libxmljs.
spellcheck: Spell checks the specified text using Google and executes the specified callback when the result (in XML) is ready or when there is an error.
var libxml=require('./libxmljs');
/**
 * Converts an XML string, or an already-parsed libxmljs element, into a plain
 * JavaScript object.
 *
 * Shape of the result:
 *   '@'    - object mapping attribute names to values (only present when the
 *            element has at least one attribute)
 *   '#'    - trimmed text taken from the last child node named "text"
 *            ("" when there is none)
 *   <name> - the converted child element, or an array of converted children
 *            when several siblings share the same name
 *
 * @param {string|Object} obj XML source text, or a node exposing
 *     childNodes(), attrs(), name(), text() and child(index).
 * @returns {Object|boolean} The converted object, or false when the string
 *     did not yield a document.
 */
function xml2js(obj) {
  if (typeof obj == 'string') {
    var doc = libxml.parseXmlString(obj);
    if (!doc)
      return false;
    obj = doc.root();
  }

  // Own-property checks keep element names such as "toString" or
  // "constructor" from being mistaken for already-seen children.
  var hasOwn = Object.prototype.hasOwnProperty;
  var jsobj = {};
  var children = obj.childNodes();
  var attributes = obj.attrs();
  var i, j, k;

  if (attributes.length > 0) {
    jsobj['@'] = {};
    for (i = 0; i < attributes.length; i++)
      jsobj['@'][attributes[i].name()] = attributes[i].value();
  }
  jsobj['#'] = "";

  for (i = 0; i < children.length; i++) {
    var node = children[i];
    var name = node.name();

    // Begin "text" kludge ------
    // Text nodes and <text> elements both report the name "text", so they
    // are handled together here instead of being recursed into.
    if (name == 'text') {
      jsobj['#'] = node.text().trim();
      var innerCount = node.childNodes().length;
      for (j = 0; j < innerCount; j++) {
        var inner = node.child(j);
        if (inner.name() == 'text') {
          var text = {};
          var textattrs = inner.attrs();
          text['#'] = inner.text();
          if (textattrs.length > 0) {
            text['@'] = {};
            // Uses its own counter: advancing the outer index here would
            // never terminate and would corrupt the outer loop.
            for (k = 0; k < textattrs.length; k++)
              text['@'][textattrs[k].name()] = textattrs[k].value();
          }
          jsobj['text'] = text;
          break; // only allow one "<text></text>" element for now
        }
      }
      continue;
    }
    // End "text" kludge --------

    var converted = xml2js(node);
    if (!hasOwn.call(jsobj, name)) {
      jsobj[name] = converted;
    } else {
      // Second occurrence of a name: promote the single value to an array.
      // Array.isArray is used because a converted element may itself own a
      // "length" key (from a <length> child).
      if (!Array.isArray(jsobj[name]))
        jsobj[name] = [jsobj[name]];
      jsobj[name].push(converted);
    }
  }
  return jsobj;
}
/**
 * Spell checks `text` by POSTing it to Google's spell-check endpoint
 * (https://www.google.com/tbproxy/spell) and reports the outcome through `cb`.
 *
 * @param {string} text Text to check. It is XML-escaped before being embedded
 *     in the request document.
 * @param {function} cb Invoked exactly once:
 *     cb(xmlString)        - response body when the server answers 200;
 *     cb(null, statusCode) - when the server answers with any other status;
 *     cb(null, error)      - when the request or response stream fails.
 */
function spellcheck(text, cb) {
  var respBody = "";
  var finished = false;

  // Reports a failure at most once, and never after a success.
  function fail(reason) {
    if (finished)
      return;
    finished = true;
    cb(null, reason);
  }

  // Escape markup characters so arbitrary text cannot break the request XML.
  var escaped = String(text)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');

  var reqbody = "<?xml version=\"1.0\" encoding=\"utf-8\" ?> " +
    "<spellrequest textalreadyclipped=\"0\" ignoredups=\"0\" ignoredigits=\"1\" " +
    "ignoreallcaps=\"1\"> " +
    "<text>" + escaped + "</text> " +
    "</spellrequest>";

  // https.request replaces the deprecated http.createClient(443, host, true).
  var request = require('https').request({
    method: 'POST',
    host: 'www.google.com',
    port: 443,
    path: '/tbproxy/spell?lang=en&hl=en',
    headers: {
      'host': 'www.google.com',
      // Content-Length counts bytes, not UTF-16 code units.
      'Content-Length': Buffer.byteLength(reqbody, 'utf8')
    }
  });

  request.on('response', function (response) {
    if (response.statusCode != 200) {
      response.resume(); // discard the body so the connection can be released
      fail(response.statusCode);
      return;
    }
    response.setEncoding('utf8');
    response.on('data', function (chunk) {
      /* if request text contains errors, response will contain "c" tags/nodes with attributes:
         o: the offset from the start of the text of the word
         l: length of misspelled word
         s: Confidence of the suggestion
         tag/node content: tab-delimited list of suggestions
      */
      respBody += chunk;
    });
    response.on('error', fail);
    response.on('end', function () {
      if (finished)
        return;
      finished = true;
      cb(respBody);
    });
  });
  // Without this listener a network failure would surface as an uncaught
  // 'error' event instead of reaching the callback.
  request.on('error', fail);
  request.end(reqbody);
}