JavaScript Unicode UTF-8

标签: , , , , ,

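JavaScript 字符串使用的是 Unicode 编码,实现方式是 UTF-16,每个码元(code unit)占用两个字节,基本多文种平面(BMP)以外的字符则由两个码元组成的代理对(surrogate pair)表示;至于是 Big-Endian 还是 Little-Endian,似乎应该和具体实现有关,不过这并不重要。JavaScript 无法直接操作字符串中的单个字节,所以只能用双字节的 UTF-16 码元来模拟 UTF-8 的字节:转换结果中的每个字符(取值 0x00–0xFF)代表一个 UTF-8 字节,在内存中的字节并不是真正意义的 UTF-8。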

/**
 * Encodes a JavaScript (UTF-16) string as UTF-8.
 *
 * The result is a "byte string": every character in it has a code in the
 * range 0x00-0xFF and stands for exactly one UTF-8 byte.
 *
 * - A valid surrogate pair is combined into one code point and written as a
 *   single 4-byte sequence (standard UTF-8, not two 3-byte sequences).
 * - An unpaired surrogate is written as a 3-byte sequence so that
 *   utf8to16() can restore it unchanged.
 * - U+0000 is written as the single byte 0x00.
 *
 * @param {string} str - Text to encode.
 * @returns {string} Byte string holding the UTF-8 encoding of `str`.
 */
function utf16to8(str) {
    // Pieces are collected in an array and joined once at the end instead of
    // growing a string with repeated concatenation.
    var out = [];
    var len = str.length;
    var i, c, low;

    for (i = 0; i < len; i++) {
        c = str.charCodeAt(i);

        // High surrogate followed by a low surrogate: merge the two code
        // units into one supplementary code point (U+10000..U+10FFFF).
        if (c >= 0xD800 && c <= 0xDBFF && i + 1 < len) {
            low = str.charCodeAt(i + 1);
            if (low >= 0xDC00 && low <= 0xDFFF) {
                c = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
                i++;
            }
        }

        if (c <= 0x007F) {
            // 0xxxxxxx
            out.push(String.fromCharCode(c));
        } else if (c <= 0x07FF) {
            // 110x xxxx   10xx xxxx
            out.push(String.fromCharCode(
                0xC0 | (c >> 6),
                0x80 | (c & 0x3F)));
        } else if (c <= 0xFFFF) {
            // 1110 xxxx  10xx xxxx  10xx xxxx
            out.push(String.fromCharCode(
                0xE0 | (c >> 12),
                0x80 | ((c >> 6) & 0x3F),
                0x80 | (c & 0x3F)));
        } else {
            // 1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
            out.push(String.fromCharCode(
                0xF0 | (c >> 18),
                0x80 | ((c >> 12) & 0x3F),
                0x80 | ((c >> 6) & 0x3F),
                0x80 | (c & 0x3F)));
        }
    }
    return out.join("");
}

/**
 * Decodes a UTF-8 "byte string" (one character per byte, codes 0x00-0xFF)
 * into a regular JavaScript (UTF-16) string.
 *
 * - 1-, 2-, 3- and 4-byte sequences are decoded; code points above U+FFFF
 *   are returned as surrogate pairs.
 * - Decoding is lenient: overlong forms (for example C0 80 for U+0000) and
 *   3-byte encoded surrogates are accepted, so byte strings that use those
 *   forms still decode.
 * - Malformed input never produces made-up characters: a byte that cannot
 *   start a sequence, or a lead byte whose continuation bytes are missing
 *   or invalid, is skipped and decoding resumes at the next byte.
 *
 * @param {string} str - Byte string containing UTF-8 data.
 * @returns {string} The decoded text.
 */
function utf8to16(str) {
    var out = [];
    var len = str.length;
    var i = 0;
    var c, char2, char3, char4, codePoint;

    // True only for a byte of the form 10xxxxxx. Reading past the end of the
    // string yields NaN, which becomes 0 under `&` and is therefore rejected.
    function isContinuation(code) {
        return (code & 0xFFC0) === 0x80;
    }

    while (i < len) {
        c = str.charCodeAt(i++);
        switch (c >> 4) {
          case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
            // 0xxxxxxx
            out.push(str.charAt(i - 1));
            break;
          case 12: case 13:
            // 110x xxxx   10xx xxxx
            char2 = str.charCodeAt(i);
            if (isContinuation(char2)) {
                i += 1;
                out.push(String.fromCharCode(
                    ((c & 0x1F) << 6) | (char2 & 0x3F)));
            }
            break;
          case 14:
            // 1110 xxxx  10xx xxxx  10xx xxxx
            char2 = str.charCodeAt(i);
            char3 = str.charCodeAt(i + 1);
            if (isContinuation(char2) && isContinuation(char3)) {
                i += 2;
                out.push(String.fromCharCode(
                    ((c & 0x0F) << 12) |
                    ((char2 & 0x3F) << 6) |
                    (char3 & 0x3F)));
            }
            break;
          case 15:
            // 1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
            char2 = str.charCodeAt(i);
            char3 = str.charCodeAt(i + 1);
            char4 = str.charCodeAt(i + 2);
            // (c & 0x08) rejects the lead bytes 0xF8-0xFF.
            if ((c & 0x08) === 0 && isContinuation(char2) &&
                    isContinuation(char3) && isContinuation(char4)) {
                codePoint = ((c & 0x07) << 18) |
                            ((char2 & 0x3F) << 12) |
                            ((char3 & 0x3F) << 6) |
                            (char4 & 0x3F);
                if (codePoint <= 0x10FFFF) {
                    i += 3;
                    if (codePoint > 0xFFFF) {
                        // Split into a high/low surrogate pair.
                        codePoint -= 0x10000;
                        out.push(String.fromCharCode(
                            0xD800 + (codePoint >> 10),
                            0xDC00 + (codePoint & 0x3FF)));
                    } else {
                        // Overlong 4-byte form of a BMP character.
                        out.push(String.fromCharCode(codePoint));
                    }
                }
            }
            break;
          // Anything else (a stray continuation byte 0x80-0xBF, or a
          // character above 0xFF that is not a byte) is skipped.
        }
    }

    return out.join("");
}

跟 VBS 一样,JavaScript 字符串连接符 + 的效率也是很低的(尤其是在较旧的引擎中),如果要提高效率的话,应该用数组来实现:先把各个片段 push 进数组,最后用 join("") 一次性拼接。

赞赏

微信赞赏支付宝赞赏

随机文章:

  1. 自甘堕落
  2. ___security_cookie
  3. 为HG255D编译OpenWrt Barrier Breaker固件
  4. 没有main函数的C程序
  5. 在Windows 2008 R2下安装PSDK

留下回复