303 lines
		
	
	
		
			7.3 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
			
		
		
	
	
			303 lines
		
	
	
		
			7.3 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
| "use strict";
 | |
| const whatwgEncoding = require("whatwg-encoding");
 | |
| 
 | |
| // https://html.spec.whatwg.org/#encoding-sniffing-algorithm
 | |
| module.exports = function sniffHTMLEncoding(buffer, options) {
 | |
|   let encoding = whatwgEncoding.getBOMEncoding(buffer); // see https://github.com/whatwg/html/issues/1910
 | |
| 
 | |
|   if (options === undefined) {
 | |
|     options = {};
 | |
|   }
 | |
| 
 | |
|   if (encoding === null && options.transportLayerEncodingLabel !== undefined) {
 | |
|     encoding = whatwgEncoding.labelToName(options.transportLayerEncodingLabel);
 | |
|   }
 | |
| 
 | |
|   if (encoding === null) {
 | |
|     encoding = prescanMetaCharset(buffer);
 | |
|   }
 | |
| 
 | |
|   if (encoding === null && options.defaultEncoding !== undefined) {
 | |
|     encoding = options.defaultEncoding;
 | |
|   }
 | |
| 
 | |
|   if (encoding === null) {
 | |
|     encoding = "windows-1252";
 | |
|   }
 | |
| 
 | |
|   return encoding;
 | |
| };
 | |
| 
 | |
| // https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
 | |
/**
 * Scans the first bytes of a document for a `<meta>` element that declares a
 * character encoding.
 *
 * Only the first 1024 bytes (or fewer, if the buffer is shorter) are examined.
 * Comments, other tags and markup declarations are skipped so that text
 * inside them cannot be mistaken for a `<meta>` element.
 *
 * @param {*} buffer - Indexable byte sequence with a `length` property.
 * @returns {*} The encoding declared by the first usable `<meta>` element
 *   (as produced by `whatwgEncoding.labelToName`, with UTF-16LE/BE mapped to
 *   "UTF-8" and "x-user-defined" mapped to "windows-1252"), or `null` when
 *   none is found.
 */
function prescanMetaCharset(buffer) {
  const l = Math.min(buffer.length, 1024);
  for (let i = 0; i < l; i++) {
    if (buffer[i] !== 0x3C) {
      continue;
    }

    // "<": look ahead to classify what follows.
    const c1 = buffer[i + 1];
    const c2 = buffer[i + 2];
    const c3 = buffer[i + 3];
    const c4 = buffer[i + 4];
    const c5 = buffer[i + 5];

    if (c1 === 0x21 && c2 === 0x2D && c3 === 0x2D) {
      // "<!--": stop on the first ">" that is preceded by "--". Looking
      // backwards lets the dashes of "<!--" itself count, so "<!-->" and
      // "<!--->" are complete comments.
      for (i += 4; i < l; i++) {
        if (buffer[i] === 0x3E && buffer[i - 1] === 0x2D && buffer[i - 2] === 0x2D) {
          break;
        }
      }
    } else if ((c1 === 0x4D || c1 === 0x6D) &&
               (c2 === 0x45 || c2 === 0x65) &&
               (c3 === 0x54 || c3 === 0x74) &&
               (c4 === 0x41 || c4 === 0x61) &&
               (isSpaceCharacter(c5) || c5 === 0x2F)) {
      // "<meta" (any case) followed by a space or "/".
      i += 6;
      // Only the first occurrence of each attribute name is honoured.
      const seenAttributeNames = new Set();
      let gotPragma = false;
      let needPragma = null;
      let charset = null;

      let attrRes;
      do {
        attrRes = getAttribute(buffer, i, l);
        if (attrRes.attr && !seenAttributeNames.has(attrRes.attr.name)) {
          seenAttributeNames.add(attrRes.attr.name);
          if (attrRes.attr.name === "http-equiv") {
            gotPragma = attrRes.attr.value === "content-type";
          } else if (attrRes.attr.name === "content" && !charset) {
            charset = extractCharacterEncodingFromMeta(attrRes.attr.value);
            if (charset !== null) {
              needPragma = true;
            }
          } else if (attrRes.attr.name === "charset") {
            charset = whatwgEncoding.labelToName(attrRes.attr.value);
            needPragma = false;
          }
        }
        i = attrRes.i;
      } while (attrRes.attr);

      // No charset/content attribute seen, a content attribute without the
      // matching http-equiv pragma, or an unrecognised label: keep scanning.
      if (needPragma === null ||
          (needPragma === true && gotPragma === false) ||
          charset === null) {
        continue;
      }

      // A <meta> that could be read as ASCII cannot really be UTF-16.
      if (charset === "UTF-16LE" || charset === "UTF-16BE") {
        charset = "UTF-8";
      }
      if (charset === "x-user-defined") {
        charset = "windows-1252";
      }

      return charset;
    } else if ((c1 >= 0x41 && c1 <= 0x5A) || (c1 >= 0x61 && c1 <= 0x7A)) {
      // "<" + ASCII letter: skip the tag name, then consume its attributes so
      // that a ">" inside a quoted value does not end the tag early.
      for (i += 2; i < l; i++) {
        const c = buffer[i];
        // space or >
        if (isSpaceCharacter(c) || c === 0x3E) {
          break;
        }
      }
      let attrRes;
      do {
        attrRes = getAttribute(buffer, i, l);
        i = attrRes.i;
      } while (attrRes.attr);
    } else if (c1 === 0x21 || c1 === 0x2F || c1 === 0x3F) {
      // "<!", "</" or "<?": skip to the next ">".
      for (i += 2; i < l; i++) {
        if (buffer[i] === 0x3E) {
          break;
        }
      }
    }
  }
  return null;
}
 | |
| 
 | |
| // https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing
 | |
/**
 * Reads one attribute from a tag, starting at byte offset `i`.
 *
 * Leading spaces and "/" are skipped. Names and values are returned with
 * ASCII upper-case letters lower-cased; each byte becomes one character via
 * `String.fromCharCode`.
 *
 * @param {*} buffer - Indexable byte sequence.
 * @param {number} i - Offset at which to start reading.
 * @param {number} l - Exclusive upper bound for the scan.
 * @returns {{attr: {name: string, value: string}, i: number}|{i: number}}
 *   When an attribute was read, `attr` holds it and `i` is the offset at which
 *   the next call should resume. When there is no attribute, only `i` is
 *   returned: it is left ON the terminating ">" (so the caller's own
 *   "advance by one" step moves past it without skipping the following byte),
 *   or is >= `l` when the input ran out.
 */
function getAttribute(buffer, i, l) {
  for (; i < l; i++) {
    let c = buffer[i];
    // space or /
    if (isSpaceCharacter(c) || c === 0x2F) {
      continue;
    }
    // ">": end of the tag, there is no attribute. Do not consume the byte.
    if (c === 0x3E) {
      break;
    }

    let name = "";
    let value = "";

    // Attribute name.
    nameLoop: for (; i < l; i++) {
      c = buffer[i];
      // "=" after a non-empty name starts the value
      if (c === 0x3D && name !== "") {
        i++;
        break;
      }
      // space: the name is over; a value follows only if the next
      // non-space byte is "="
      if (isSpaceCharacter(c)) {
        for (i++; i < l; i++) {
          c = buffer[i];
          if (isSpaceCharacter(c)) {
            continue;
          }
          // not "=": attribute without a value
          if (c !== 0x3D) {
            return { attr: { name, value }, i };
          }

          i++;
          break nameLoop;
        }
        break;
      }
      // / or >: attribute without a value
      if (c === 0x2F || c === 0x3E) {
        return { attr: { name, value }, i };
      }
      name += byteToLowercaseChar(c);
    }

    // Skip spaces between "=" and the value.
    c = buffer[i];
    if (isSpaceCharacter(c)) {
      for (i++; i < l; i++) {
        c = buffer[i];
        if (!isSpaceCharacter(c)) {
          break;
        }
      }
    }

    // Quoted value: everything up to the matching " or '.
    if (c === 0x22 || c === 0x27) {
      const quote = c;
      for (i++; i < l; i++) {
        c = buffer[i];

        if (c === quote) {
          i++;
          return { attr: { name, value }, i };
        }

        value += byteToLowercaseChar(c);
      }
    }

    // ">": empty value
    if (c === 0x3E) {
      return { attr: { name, value }, i };
    }

    // Unquoted value: runs until a space or ">".
    value += byteToLowercaseChar(c);

    for (i++; i < l; i++) {
      c = buffer[i];

      if (isSpaceCharacter(c) || c === 0x3E) {
        return { attr: { name, value }, i };
      }

      value += byteToLowercaseChar(c);
    }
  }
  return { i };
}

// Turns one byte into a one-character string, mapping ASCII A-Z to a-z.
function byteToLowercaseChar(c) {
  return String.fromCharCode(c >= 0x41 && c <= 0x5A ? c + 0x20 : c);
}
 | |
| 
 | |
/**
 * Extracts the encoding named by a `charset=...` declaration inside a string
 * (the value of a `<meta content>` attribute).
 *
 * The first "charset" (any ASCII case) that is followed, after optional
 * whitespace, by "=" is used. The label is either the text between matching
 * quotes or the run of characters up to the first whitespace or ";".
 *
 * @param {string} string - Text to search.
 * @returns {*} Whatever `whatwgEncoding.labelToName` returns for the label,
 *   or `null` when there is no declaration, nothing follows the "=", or a
 *   quote is unmatched.
 */
function extractCharacterEncodingFromMeta(string) {
  let position = 0;

  while (true) {
    // search() reports an offset relative to the substring; convert it back
    // to an absolute index before using it on `string`.
    const relativeIndex = string.substring(position).search(/charset/i);
    if (relativeIndex === -1) {
      return null;
    }

    let subPosition = position + relativeIndex + "charset".length;

    // charCodeAt yields NaN past the end of the string, which is simply
    // "not a space" - no exception when the string ends here.
    while (isSpaceCharacter(string.charCodeAt(subPosition))) {
      ++subPosition;
    }

    if (string[subPosition] !== "=") {
      // Not a declaration; resume just before the character that followed.
      position = subPosition - 1;
      continue;
    }

    ++subPosition;

    while (isSpaceCharacter(string.charCodeAt(subPosition))) {
      ++subPosition;
    }

    position = subPosition;
    break;
  }

  // Nothing after "charset=".
  if (position >= string.length) {
    return null;
  }

  const first = string[position];
  if (first === "\"" || first === "'") {
    const closingIndex = string.indexOf(first, position + 1);

    // An unmatched quotation mark yields nothing.
    if (closingIndex === -1) {
      return null;
    }
    return whatwgEncoding.labelToName(string.substring(position + 1, closingIndex));
  }

  // The first character always belongs to the label; look for the
  // terminator after it and translate the relative offset to an absolute one.
  const relativeEnd = string.substring(position + 1).search(/[\x09\x0A\x0C\x0D\x20;]/);
  const end = relativeEnd === -1 ? string.length : position + 1 + relativeEnd;

  return whatwgEncoding.labelToName(string.substring(position, end));
}
 | |
| 
 | |
/**
 * Tells whether a byte / code unit value is one of the five whitespace
 * values recognised by the sniffer: tab (0x09), line feed (0x0A),
 * form feed (0x0C), carriage return (0x0D) or space (0x20).
 *
 * @param {*} c - Value to test; compared with strict equality, so anything
 *   that is not one of those numbers yields `false`.
 * @returns {boolean}
 */
function isSpaceCharacter(c) {
  switch (c) {
    case 0x09: // tab
    case 0x0A: // line feed
    case 0x0C: // form feed
    case 0x0D: // carriage return
    case 0x20: // space
      return true;
    default:
      return false;
  }
}
 |