Code Snippets

node.js-related

  • tidyXML: Downloads a web page’s HTML source and runs it through HTML Tidy, converting it to XML. Works great for doing quick and easy web scraping: just load the resulting XML into a module such as libxmljs and use XPath to select and retrieve the desired data. [Example]
  • xml2js: Neat utility that converts an XML string (or an already-parsed libxmljs node) to a JavaScript object using libxmljs.
  • spellcheck: Spell-checks the specified text using Google and invokes the specified callback with the result (as an XML string) once the response is complete, or with (null, statusCode) when the server answers with a non-200 HTTP status.

var libxml=require('./libxmljs');

/**
 * Collects a list of attribute nodes into a plain { name: value } object.
 *
 * @param {Array} attrNodes nodes exposing name() and value().
 * @returns {Object} map of attribute name to attribute value.
 */
function xml2jsAttrs(attrNodes) {
  var result = {};
  for (var a = 0, alen = attrNodes.length; a < alen; a++)
    result[attrNodes[a].name()] = attrNodes[a].value();
  return result;
}

/**
 * Converts an XML string, or an already-parsed node, into a plain JavaScript
 * object.
 *
 * Shape of the result:
 *   '@'    map of attribute name => value (only present when there are attributes)
 *   '#'    whitespace-trimmed text() of the last child named 'text' ("" if none)
 *   'text' { '#': ..., '@': ... } for the first 'text' node nested inside a
 *          child named 'text' (see the "text" kludge below)
 *   <name> converted child element; children sharing a name are collected
 *          into an array in document order
 *
 * @param {string|Object} obj XML source, or a node exposing childNodes(),
 *   attrs(), name(), text() and child(index).
 * @returns {Object|boolean} the converted object, or false when parsing the
 *   string yields no document.
 */
function xml2js(obj) {
  if (typeof obj === 'string') {
    var doc = libxml.parseXmlString(obj);
    if (!doc)
      return false;
    obj = doc.root();
  }

  var jsobj = {};
  var children = obj.childNodes();
  var attributes = obj.attrs();

  if (attributes.length > 0)
    jsobj['@'] = xml2jsAttrs(attributes);

  jsobj['#'] = "";

  for (var i = 0, chlen = children.length; i < chlen; i++) {
    var child = children[i];
    var name = child.name();

    // Begin "text" kludge ------
    // Children named 'text' supply this element's '#' value instead of being
    // converted recursively.
    if (name === 'text') {
      jsobj['#'] = child.text().replace(/^\s*/, "").replace(/\s*$/, "");
      for (var j = 0, chtlen = child.childNodes().length; j < chtlen; j++) {
        var inner = child.child(j);
        if (inner.name() === 'text') {
          var text = {}, textattrs = inner.attrs();
          text['#'] = inner.text();
          // Uses its own loop (inside the helper) so the outer index `i` is
          // never touched while copying the nested node's attributes.
          if (textattrs.length > 0)
            text['@'] = xml2jsAttrs(textattrs);
          jsobj['text'] = text;
          break; // only allow one "<text></text>" element for now
        }
      }
      continue;
    }
    // End "text" kludge --------

    var converted = xml2js(child);
    // Own-property check: an element called e.g. "constructor" must not be
    // confused with members inherited from Object.prototype.
    if (!Object.prototype.hasOwnProperty.call(jsobj, name))
      jsobj[name] = converted;
    else {
      // Array.isArray rather than probing `.length`: a converted element may
      // itself own a "length" key when it has a <length> child.
      if (!Array.isArray(jsobj[name]))
        jsobj[name] = [jsobj[name]];
      jsobj[name].push(converted);
    }
  }
  return jsobj;
}
	

/**
 * Spell checks `text` by POSTing a <spellrequest> document to
 * www.google.com/tbproxy/spell and hands the raw XML response to `cb`.
 *
 * @param {string} text plain text to check; XML metacharacters are escaped
 *   here, so callers should pass unescaped text.
 * @param {Function} cb called as cb(responseXml) once a 200 response has
 *   been fully received, or as cb(null, statusCode) for any other status.
 */
function spellcheck(text, cb) {
  // Escape markup characters so arbitrary input cannot break (or inject
  // elements into) the request document. '&' must be replaced first.
  var escapedText = String(text).replace(/&/g, '&amp;')
                                .replace(/</g, '&lt;')
                                .replace(/>/g, '&gt;');
  var respBody = "";
  var reqbody = "<?xml version=\"1.0\" encoding=\"utf-8\" ?> \
                 <spellrequest textalreadyclipped=\"0\" ignoredups=\"0\" ignoredigits=\"1\" \
                               ignoreallcaps=\"1\"> \
                   <text>" + escapedText + "</text> \
                 </spellrequest>";
  var request = require('http').createClient(443, 'www.google.com', true)
                               .request('POST', '/tbproxy/spell?lang=en&hl=en',
                                        {'host': 'www.google.com',
                                         // Content-Length counts bytes, not characters;
                                         // the two differ for non-ASCII text.
                                         'Content-Length': Buffer.byteLength(reqbody, 'utf8')});
  // Encoding is explicit so the bytes sent match both the declared
  // Content-Length and the encoding named in the XML declaration.
  request.write(reqbody, 'utf8');
  request.addListener('response', function (response) {
    if (response.statusCode != 200) {
      cb(null, response.statusCode);
      return;
    }

    response.setEncoding('utf8');
    response.addListener('data', function (chunk) {
      /* if request text contains errors, response will contain "c" tags/nodes with attributes:
          o: the offset from the start of the text of the word
          l: length of misspelled word
          s: Confidence of the suggestion
          tag/node content: tab-delimited list of suggestions
      */
      respBody += chunk;
    });
    // Only reached for 200 responses, so no separate error flag is needed.
    response.addListener('end', function() {
      cb(respBody);
    });
  });
  request.end();
}
	

exports.tidyXML = function(url, callback, config) {
  require('http').cat(url, function(err, content) {
    if (err !== null) {
      if (typeof err == 'number')
        callback("Received HTTP status code " + err);
      else
        callback(err);
    } else {
      var tidyArgs = ['-quiet', '--output-xml', 'yes', '--tidy-mark', 'no', '--show-warnings', 'no',
                      '--bare', 'yes', '--merge-divs', 'no', '--merge-spans', 'no', '--wrap', '0',
                      '--fix-bad-comments', 'yes', '--fix-uri', 'yes', '--output-bom', 'no',
                      '--clean', 'yes', '--drop-proprietary-attributes', 'yes', '--force-output', 'yes',
                      '--add-xml-decl', 'yes', '--doctype', 'omit'];
      if (typeof config == 'object' && typeof config.inputEncoding == 'string') {
        tidyArgs.push('--input-encoding');
        tidyArgs.push(config.inputEncoding);
      }

      tidyArgs.push('--output-encoding');
      if (typeof config == 'object' && typeof config.outputEncoding == 'string')
        tidyArgs.push(config.outputEncoding);
      else
        tidyArgs.push('latin1');
      
      var tidyProc = require('child_process').spawn('tidy', tidyArgs);
      var stdout = "";
      
      tidyProc.stdout.addListener('data', function(data) {
        stdout += data;
      });
      tidyProc.stdin.addListener('error', function(err) {
        callback('tidy is not installed');
      });
      tidyProc.addListener('exit', function(code) {
        if (code != 127)
          callback(null, stdout);
      });

      tidyProc.stdin.end(content);
    }
  });
};