Friday, October 8, 2010

Fast and correct htmlEncoding for JavaScript

// Feel free to adjust the last two alternatives in the regexp to suit your
// needs. As written, everything above 0x7F is encoded, and from the allowed
// ASCII range only the "big 5" (" & ' < >) are encoded. Using a regexp is
// still faster (especially on MSIE) than any other implementation.

/**
 * Encodes a string for safe inclusion in HTML by replacing selected
 * characters with numeric character references.
 *
 * Replacement rules, tried in this order at every position:
 *   1. a valid UTF-16 surrogate pair  -> "&#<code point>;" (decimal)
 *   2. a character unused in HTML (C0 controls except TAB/LF/CR, DEL,
 *      C1 controls, or a standalone surrogate) -> "&#xFFFD;"
 *   3. any other non-ASCII character  -> "&#<char code>;" (decimal)
 *   4. one of the "big 5" (" & ' < >) -> "&#<char code>;" (decimal)
 * Everything else is copied through unchanged.
 *
 * @param {string} s - the text to encode; must provide String#replace.
 * @returns {string} the encoded text.
 */
var _encodeHtmlRegExpImpl = (function() {
    // performance is 78ms on MSIE 7 (the slowest one)
    // on 80KB html markup from: http://www.w3.org/TR/html4/
    var re = new RegExp(
            // surrogate pair (sp)
            "([\uD800-\uDBFF][\uDC00-\uDFFF])" +
            // html UNUSED including standalone surrogates (un);
            // must come after (sp) so that valid pairs are consumed first
            "|([\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F\uD800-\uDFFF])" +
            // out of ascii (oa)
            "|([^\u0000-\u007F])" +
            // big 5 + add others (b5)
            "|([\u0022\u0026\u0027\u003C\u003E])",
            "g"
        ),
        // Combines a high and a low surrogate code unit into the
        // supplementary-plane code point they represent.
        toCodePoint = function(high, low) {
            return ((high - 0xD800) << 10) + (low - 0xDC00) + 0x010000;
        },
        // String#replace callback: m is the whole match, and exactly one of
        // sp/un/oa/b5 is the non-empty capture of the alternative that matched.
        enc = function(m, sp, un, oa, b5) {
            // extracted out from main function and ifs changed to ternary
            // thanks to Andrea Giammarchi
            return "&#" +
                (oa || b5
                    ? m.charCodeAt(0)
                    : (un
                        ? "xFFFD"
                        : toCodePoint(m.charCodeAt(0), m.charCodeAt(1)))) +
                ";";
        };

    return function(s) {
        return s.replace(re, enc);
    };
}());

No comments:

Post a Comment