123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634 |
- //[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
- //[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
- //[5] Name ::= NameStartChar (NameChar)*
// First character of an XML name. ":" is deliberately left out of the class so
// that tagNamePattern can treat it as the prefix/local-name separator, and the
// supplementary-plane range (#x10000-#xEFFFF) of the grammar is not covered.
var nameStartChar = /[A-Z_a-z\xC0-\xD6\xD8-\xF6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/;
// Any following character of a name: the start-character class (its source
// with the surrounding brackets sliced off) plus "-", ".", digits and the
// extra ranges of production [4a].
var nameChar = new RegExp(
	'[\\-\\.0-9' + nameStartChar.source.slice(1, -1) + '\\u00B7\\u0300-\\u036F\\u203F-\\u2040]'
);
// A whole tag or attribute name: one name, optionally followed by ":" and a
// second name (prefix:localName).
var tagNamePattern = new RegExp(
	'^' + nameStartChar.source + nameChar.source + '*' +
	'(?::' + nameStartChar.source + nameChar.source + '*)?$'
);
- //var tagNamePattern = /^[a-zA-Z_][\w\-\.]*(?:\:[a-zA-Z_][\w\-\.]*)?$/
- //var handlers = 'resolveEntity,getExternalSubset,characters,endDocument,endElement,endPrefixMapping,ignorableWhitespace,processingInstruction,setDocumentLocator,skippedEntity,startDocument,startElement,startPrefixMapping,notationDecl,unparsedEntityDecl,error,fatalError,warning,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,comment,endCDATA,endDTD,endEntity,startCDATA,startDTD,startEntity'.split(',')
- //S_TAG, S_ATTR, S_EQ, S_ATTR_NOQUOT_VALUE
- //S_ATTR_SPACE, S_ATTR_END, S_TAG_SPACE, S_TAG_CLOSE
// Scanner states used by parseElementStartPart while walking a start tag.
var S_TAG = 0;                // reading the tag name
var S_ATTR = 1;               // reading an attribute name
var S_ATTR_SPACE = 2;         // attribute name finished, whitespace seen after it
var S_EQ = 3;                 // "=" seen, attribute value expected next
var S_ATTR_NOQUOT_VALUE = 4;  // reading an attribute value that has no quotes
var S_ATTR_END = 5;           // quoted attribute value just ended, no whitespace yet
var S_TAG_SPACE = 6;          // whitespace after the tag name or after an attribute value
var S_TAG_CLOSE = 7;          // "/" of a self-closing element (<el />) seen
/**
 * SAX-style reader. The constructor sets nothing up; `parse` reads
 * `this.domBuilder` and `this.errorHandler`, so both must be assigned on the
 * instance before `parse` is called.
 */
function XMLReader(){
}
XMLReader.prototype = {
	/**
	 * Parses `source`, bracketing the run with startDocument()/endDocument()
	 * on the DOM builder.
	 *
	 * @param source text to parse
	 * @param defaultNSMap initial prefix-to-namespace map; it is copied, so
	 *        the caller's object is never written to
	 * @param entityMap entity name to replacement text map
	 */
	parse: function(source, defaultNSMap, entityMap){
		var builder = this.domBuilder;
		var nsMapCopy = {};
		builder.startDocument();
		_copy(defaultNSMap, nsMapCopy);
		parse(source, nsMapCopy, entityMap, builder, this.errorHandler);
		builder.endDocument();
	}
};
/**
 * Main tokenizer loop: walks `source` from "<" to "<", forwarding text, end
 * tags, processing instructions, declarations/comments/CDATA and start tags
 * to `domBuilder`, and reporting problems through `errorHandler`.
 *
 * @param source the text to parse
 * @param defaultNSMapCopy prefix-to-namespace map used for the root scope
 * @param entityMap entity name to replacement text; only own properties are
 *        consulted, so names such as "constructor" are not treated as entities
 * @param domBuilder receiver of the SAX-like callbacks; its `locator` (when
 *        set) is updated with line/column information
 * @param errorHandler object with warning/error/fatalError methods
 */
function parse(source,defaultNSMapCopy,entityMap,domBuilder,errorHandler){
	// String.fromCharCode only handles UTF-16 code units, so code points
	// above U+FFFF are emitted as a surrogate pair.
	function fixedFromCharCode(code) {
		if (code > 0xffff) {
			code -= 0x10000;
			var surrogate1 = 0xd800 + (code >> 10)
				, surrogate2 = 0xdc00 + (code & 0x3ff);
			return String.fromCharCode(surrogate1, surrogate2);
		} else {
			return String.fromCharCode(code);
		}
	}
	// Replacement callback for /&#?\w+;/g: named entity, numeric character
	// reference, or (after reporting an error) the reference text unchanged.
	function entityReplacer(a){
		var k = a.slice(1,-1);
		// Own-property check: `k in entityMap` would also match members
		// inherited from Object.prototype (constructor, toString, ...).
		if(Object.prototype.hasOwnProperty.call(entityMap,k)){
			return entityMap[k];
		}else if(k.charAt(0) === '#'){
			// "#x41" becomes "0x41" so parseInt reads it as hexadecimal.
			return fixedFromCharCode(parseInt(k.substr(1).replace('x','0x')))
		}else{
			errorHandler.error('entity not found:'+a);
			return a;
		}
	}
	// Emits the text between `start` and `end` (entities decoded) and moves
	// `start` forward; does nothing when the range is empty.
	function appendText(end){//has some bugs
		if(end>start){
			var xt = source.substring(start,end).replace(/&#?\w+;/g,entityReplacer);
			locator&&position(start);
			domBuilder.characters(xt,0,end-start);
			start = end
		}
	}
	// Advances the line bookkeeping until offset `p` lies inside the current
	// line, then stores the 1-based column in the locator.
	function position(p,m){
		while(p>=lineEnd && (m = linePattern.exec(source))){
			lineStart = m.index;
			lineEnd = lineStart + m[0].length;
			locator.lineNumber++;
		}
		locator.columnNumber = p-lineStart+1;
	}
	var lineStart = 0;
	var lineEnd = 0;
	var linePattern = /.*(?:\r\n?|\n)|.*$/g
	var locator = domBuilder.locator;

	// Stack of open elements; the bottom entry only carries the root namespace map.
	var parseStack = [{currentNSMap:defaultNSMapCopy}]
	// Cache shared by fixSelfClosed calls (tag name to closing-tag offset).
	var closeMap = {};
	var start = 0;
	while(true){
		try{
			var tagStart = source.indexOf('<',start);
			if(tagStart<0){
				// No more markup: non-blank trailing text is appended directly to the document.
				if(!source.substr(start).match(/^\s*$/)){
					var doc = domBuilder.doc;
					var text = doc.createTextNode(source.substr(start));
					doc.appendChild(text);
					domBuilder.currentElement = text;
				}
				return;
			}
			if(tagStart>start){
				appendText(tagStart);
			}
			switch(source.charAt(tagStart+1)){
			case '/':
				var end = source.indexOf('>',tagStart+3);
				var tagName = source.substring(tagStart+2,end);
				var config = parseStack.pop();
				if(end<0){
					// No ">" anywhere after the end tag: recover the name up to the next space or "<".
					tagName = source.substring(tagStart+2).replace(/[\s<].*/,'');
					errorHandler.error("end tag name: "+tagName+' is not complete:'+config.tagName);
					end = tagStart+1+tagName.length;
				}else if(tagName.match(/\s</)){
					tagName = tagName.replace(/[\s<].*/,'');
					errorHandler.error("end tag name: "+tagName+' maybe not complete');
					end = tagStart+1+tagName.length;
				}
				var localNSMap = config.localNSMap;
				var endMatch = config.tagName == tagName;
				var endIgnoreCaseMach = endMatch || config.tagName&&config.tagName.toLowerCase() == tagName.toLowerCase()
				if(endIgnoreCaseMach){
					domBuilder.endElement(config.uri,config.localName,tagName);
					if(localNSMap){
						for(var prefix in localNSMap){
							domBuilder.endPrefixMapping(prefix) ;
						}
					}
					if(!endMatch){
						errorHandler.fatalError("end tag name: "+tagName+' is not match the current start tagName:'+config.tagName );
					}
				}else{
					// Stray end tag: keep the current element open.
					parseStack.push(config)
				}

				end++;
				break;
			case '?':// <?...?>
				locator&&position(tagStart);
				end = parseInstruction(source,tagStart,domBuilder);
				break;
			case '!':// <!doctype,<![CDATA,<!--
				locator&&position(tagStart);
				end = parseDCC(source,tagStart,domBuilder,errorHandler);
				break;
			default:
				locator&&position(tagStart);
				var el = new ElementAttributes();
				var currentNSMap = parseStack[parseStack.length-1].currentNSMap;
				// Offset of the ">" that ends the start tag.
				var end = parseElementStartPart(source,tagStart,el,currentNSMap,entityReplacer,errorHandler);
				var len = el.length;

				if(!el.closed && fixSelfClosed(source,end,el.tagName,closeMap)){
					el.closed = true;
					if(!entityMap.nbsp){
						errorHandler.warning('unclosed xml attribute');
					}
				}
				if(locator && len){
					// Give every attribute its own locator snapshot, then report the
					// element itself with the position it had before the attributes.
					var locator2 = copyLocator(locator,{});
					for(var i = 0;i<len;i++){
						var a = el[i];
						position(a.offset);
						a.locator = copyLocator(locator,{});
					}
					domBuilder.locator = locator2
					if(appendElement(el,domBuilder,currentNSMap)){
						parseStack.push(el)
					}
					domBuilder.locator = locator;
				}else{
					if(appendElement(el,domBuilder,currentNSMap)){
						parseStack.push(el)
					}
				}

				if(el.uri === 'http://www.w3.org/1999/xhtml' && !el.closed){
					end = parseHtmlSpecialContent(source,end,el.tagName,entityReplacer,domBuilder)
				}else{
					end++;
				}
			}
		}catch(e){
			errorHandler.error('element parse error: '+e)
			end = -1;
		}
		if(end>start){
			start = end;
		}else{
			// TODO: the SAX cursor may move backwards here, so reported positions can be wrong.
			// The "<" that could not be handled is emitted as plain text.
			appendText(Math.max(tagStart,start)+1);
		}
	}
}
/**
 * Copies the line and column of locator `f` onto `t`.
 * @return the target object `t`
 */
function copyLocator(f,t){
	var fields = ['lineNumber', 'columnNumber'];
	for(var i = 0; i < fields.length; i++){
		t[fields[i]] = f[fields[i]];
	}
	return t;
}
/**
 * Scans one start tag character by character, beginning at the "<" found at
 * `start`, and fills `el` with the tag name (el.setTagName), the attributes
 * (el.add) and the `closed` flag for self-closing tags.
 *
 * Malformed but recoverable input (unquoted values, attributes without a
 * value, missing whitespace) is reported through errorHandler.warning;
 * unrecoverable input throws an Error.
 *
 * @see #appendElement(source,elStartEnd,el,selfClosed,entityReplacer,domBuilder,parseStack);
 * @return offset of the ">" that ends the start tag, or the offset of the end
 *         of input when the tag is never terminated
 */
function parseElementStartPart(source,start,el,currentNSMap,entityReplacer,errorHandler){
	var attrName;
	var value;
	var pos = ++start;
	var state = S_TAG;

	// Expands entity and character references inside an attribute value.
	function decode(raw){
		return raw.replace(/&#?\w+;/g,entityReplacer);
	}
	// True for the value-less attributes tolerated without a warning in XHTML.
	function isHtmlBooleanAttr(name){
		return currentNSMap[''] === 'http://www.w3.org/1999/xhtml' && !!name.match(/^(?:disabled|checked|selected)$/i);
	}

	while(true){
		var ch = source.charAt(pos);
		switch(ch){
		case '=':
			if(state !== S_ATTR && state !== S_ATTR_SPACE){
				// "=" is only legal right after an attribute name (or the space following it)
				throw new Error('attribute equal must after attrName');
			}
			if(state === S_ATTR){
				attrName = source.slice(start,pos);
			}
			state = S_EQ;
			break;
		case '\'':
		case '"':
			if(state === S_EQ || state === S_ATTR){
				if(state === S_ATTR){
					// quote directly after the name: tolerate the missing "="
					errorHandler.warning('attribute value must after "="')
					attrName = source.slice(start,pos)
				}
				start = pos+1;
				pos = source.indexOf(ch,start)
				if(pos>0){
					value = decode(source.slice(start,pos));
					el.add(attrName,value,start-1);
					state = S_ATTR_END;
				}else{
					// no matching closing quote
					throw new Error('attribute value no end \''+ch+'\' match');
				}
			}else if(state == S_ATTR_NOQUOT_VALUE){
				// closing quote without an opening one: what was read so far is the value
				value = decode(source.slice(start,pos));
				el.add(attrName,value,start);
				errorHandler.warning('attribute "'+attrName+'" missed start quot('+ch+')!!');
				start = pos+1;
				state = S_ATTR_END
			}else{
				throw new Error('attribute value must after "="');
			}
			break;
		case '/':
			if(state === S_TAG){
				el.setTagName(source.slice(start,pos));
			}
			if(state === S_TAG || state === S_ATTR_END || state === S_TAG_SPACE || state === S_TAG_CLOSE){
				state = S_TAG_CLOSE;
				el.closed = true;
			}else if(state !== S_ATTR_NOQUOT_VALUE && state !== S_ATTR && state !== S_ATTR_SPACE){
				// i.e. directly after "="
				throw new Error("attribute invalid close char('/')")
			}
			// inside a name or an unquoted value the "/" is just part of the text
			break;
		case ''://end document
			errorHandler.error('unexpected end of input');
			if(state == S_TAG){
				el.setTagName(source.slice(start,pos));
			}
			return pos;
		case '>':
			switch(state){
			case S_TAG:
				el.setTagName(source.slice(start,pos));
				// falls through
			case S_ATTR_END:
			case S_TAG_SPACE:
			case S_TAG_CLOSE:
				break;//normal end of the start tag
			case S_ATTR_NOQUOT_VALUE://compatibility: unquoted value runs up to ">"
			case S_ATTR:
				value = source.slice(start,pos);
				if(value.slice(-1) === '/'){
					el.closed = true;
					value = value.slice(0,-1)
				}
				// falls through
			case S_ATTR_SPACE:
				if(state === S_ATTR_SPACE){
					value = attrName;
				}
				if(state == S_ATTR_NOQUOT_VALUE){
					errorHandler.warning('attribute "'+value+'" missed quot(")!!');
					el.add(attrName,decode(value),start)
				}else{
					// attribute without a value: its own name is used as the value
					if(!isHtmlBooleanAttr(value)){
						errorHandler.warning('attribute "'+value+'" missed value!! "'+value+'" instead!!')
					}
					el.add(value,value,start)
				}
				break;
			case S_EQ:
				throw new Error('attribute value missed!!');
			}
			return pos;
		/*xml space '\x20' | #x9 | #xD | #xA; */
		case '\u0080':
			ch = ' ';
			// falls through: U+0080 is handled like whitespace
		default:
			if(ch<= ' '){//whitespace
				switch(state){
				case S_TAG:
					el.setTagName(source.slice(start,pos));
					state = S_TAG_SPACE;
					break;
				case S_ATTR:
					attrName = source.slice(start,pos)
					state = S_ATTR_SPACE;
					break;
				case S_ATTR_NOQUOT_VALUE:
					value = decode(source.slice(start,pos));
					errorHandler.warning('attribute "'+value+'" missed quot(")!!');
					el.add(attrName,value,start)
					// falls through
				case S_ATTR_END:
					state = S_TAG_SPACE;
					break;
				// S_TAG_SPACE, S_EQ, S_ATTR_SPACE and S_TAG_CLOSE ignore whitespace
				}
			}else if(state === S_ATTR_SPACE){
				// a new name starts: the previous attribute had no value
				if(!isHtmlBooleanAttr(attrName)){
					errorHandler.warning('attribute "'+attrName+'" missed value!! "'+attrName+'" instead2!!')
				}
				el.add(attrName,attrName,start);
				start = pos;
				state = S_ATTR;
			}else if(state === S_ATTR_END || state === S_TAG_SPACE){
				if(state === S_ATTR_END){
					errorHandler.warning('attribute space is required"'+attrName+'"!!')
				}
				state = S_ATTR;
				start = pos;
			}else if(state === S_EQ){
				state = S_ATTR_NOQUOT_VALUE;
				start = pos;
			}else if(state === S_TAG_CLOSE){
				throw new Error("elements closed character '/' and '>' must be connected to");
			}
			// S_TAG, S_ATTR and S_ATTR_NOQUOT_VALUE simply keep consuming characters
		}//end outer switch
		pos++;
	}
}
/**
 * Resolves the namespaces of a scanned start tag and reports it to the DOM
 * builder.
 *
 * First pass over the attributes: splits each qName into prefix/localName and
 * registers every `xmlns` / `xmlns:*` declaration (on a private copy of
 * `currentNSMap`, so the parent scope is untouched). Second pass: assigns the
 * namespace URI of every prefixed attribute. Finally the element's own
 * namespace is resolved and startElement is emitted; a closed element also
 * gets endElement and endPrefixMapping right away.
 *
 * @param el attribute list produced by parseElementStartPart (array-like,
 *        with tagName and closed)
 * @param domBuilder receiver of startPrefixMapping/startElement/endElement/
 *        endPrefixMapping
 * @param currentNSMap prefix-to-namespace map of the enclosing scope
 * @return true when the element stays open (the caller has to push it on the
 *         parse stack); undefined when the element was closed immediately
 */
function appendElement(el,domBuilder,currentNSMap){
	var tagName = el.tagName;
	var localNSMap = null;
	var a, prefix, localName, nsPrefix, nsp;
	var i = el.length;
	while(i--){
		a = el[i];
		var qName = a.qName;
		var value = a.value;
		nsp = qName.indexOf(':');
		if(nsp>0){
			prefix = a.prefix = qName.slice(0,nsp);
			localName = qName.slice(nsp+1);
			// declared prefix for "xmlns:foo", otherwise false
			nsPrefix = prefix === 'xmlns' && localName
		}else{
			localName = qName;
			prefix = null
			// '' marks a default-namespace declaration, otherwise false
			nsPrefix = qName === 'xmlns' && ''
		}
		//can not set prefix,because prefix !== ''
		a.localName = localName ;
		//prefix == null for no ns prefix attribute
		if(nsPrefix !== false){//false means "not a namespace declaration"
			if(localNSMap == null){
				// first declaration on this element: switch to a private copy of the map
				localNSMap = {}
				var inheritedNSMap = currentNSMap;
				currentNSMap = {};
				_copy(inheritedNSMap,currentNSMap)
			}
			currentNSMap[nsPrefix] = localNSMap[nsPrefix] = value;
			a.uri = 'http://www.w3.org/2000/xmlns/'
			domBuilder.startPrefixMapping(nsPrefix, value)
		}
	}
	i = el.length;
	while(i--){
		a = el[i];
		prefix = a.prefix;
		if(prefix){//no prefix attribute has no namespace
			if(prefix === 'xml'){
				// The xml prefix is bound to this namespace by definition; it must not
				// be overwritten by a lookup in the map (which may lack it).
				a.uri = 'http://www.w3.org/XML/1998/namespace';
			}else if(prefix !== 'xmlns'){
				a.uri = currentNSMap[prefix]
			}
		}
	}
	nsp = tagName.indexOf(':');
	if(nsp>0){
		prefix = el.prefix = tagName.slice(0,nsp);
		localName = el.localName = tagName.slice(nsp+1);
	}else{
		prefix = null;//important!!
		localName = el.localName = tagName;
	}
	//no prefix element has default namespace
	var ns = el.uri = currentNSMap[prefix || ''];
	domBuilder.startElement(ns,localName,tagName,el);
	//endPrefixMapping and startPrefixMapping have not any help for dom builder
	if(el.closed){
		domBuilder.endElement(ns,localName,tagName);
		if(localNSMap){
			for(prefix in localNSMap){
				domBuilder.endPrefixMapping(prefix)
			}
		}
	}else{
		// remembered so the matching end tag can close the prefix mappings
		el.currentNSMap = currentNSMap;
		el.localNSMap = localNSMap;
		return true;
	}
}
/**
 * Handles the raw-text content of XHTML <script> and <textarea> elements.
 *
 * When the content up to the matching "</tagName>" contains "&" or "<", it is
 * emitted as a single characters() call (verbatim for script, with entities
 * decoded for textarea) and the offset of the closing tag is returned so that
 * normal parsing resumes there. In every other case (different tag, content
 * without "&"/"<", or no closing tag found) nothing is emitted and parsing
 * simply continues after the start tag.
 *
 * @param source the whole text being parsed
 * @param elStartEnd offset of the ">" that ends the start tag
 * @param tagName tag name as written in the source
 * @param entityReplacer callback used to decode entity references
 * @param domBuilder receiver of characters()
 * @return offset at which the main loop should continue
 */
function parseHtmlSpecialContent(source,elStartEnd,tagName,entityReplacer,domBuilder){
	if(/^(?:script|textarea)$/i.test(tagName)){
		var elEndStart = source.indexOf('</'+tagName+'>',elStartEnd);
		if(elEndStart<0){
			// No closing tag: substring() with -1 would swap its arguments and
			// yield the text in front of the element, so fall back to normal parsing.
			return elStartEnd+1;
		}
		var text = source.substring(elStartEnd+1,elEndStart);
		if(/[&<]/.test(text)){
			if(!/^script$/i.test(tagName)){
				// textarea: entity references are decoded; script text stays verbatim
				text = text.replace(/&#?\w+;/g,entityReplacer);
			}
			domBuilder.characters(text,0,text.length);
			return elEndStart;
		}
	}
	return elStartEnd+1;
}
/**
 * Decides whether a start tag has to be treated as self-closed because no
 * closing tag for it follows in the document.
 *
 * The offset of the last "</tagName>" (or, failing that, of the last
 * "</tagName" without ">") is looked up once per tag name and cached in
 * `closeMap`.
 *
 * @param source the whole text being parsed
 * @param elStartEnd offset of the end of the start tag
 * @param tagName tag name as written in the source
 * @param closeMap cache object shared between calls (tag name to offset)
 * @return true when the last closing tag lies before `elStartEnd` (or does
 *         not exist)
 */
function fixSelfClosed(source,elStartEnd,tagName,closeMap){
	// Own-property lookup: a plain closeMap[tagName] would return inherited
	// members (e.g. for a tag called "constructor") instead of a cached offset.
	var pos = Object.prototype.hasOwnProperty.call(closeMap,tagName) ? closeMap[tagName] : null;
	if(pos == null){
		pos = source.lastIndexOf('</'+tagName+'>')
		if(pos<elStartEnd){//closing tag may have been left unterminated
			pos = source.lastIndexOf('</'+tagName)
		}
		closeMap[tagName] =pos
	}
	return pos<elStartEnd;
}
/**
 * Copies every enumerable property of `source` (inherited ones included, as
 * visited by for-in) onto `target`. Returns nothing.
 */
function _copy(source,target){
	var key;
	for(key in source){
		target[key] = source[key];
	}
}
/**
 * Parses a construct that starts with "<!" at `start`: a comment, a CDATA
 * section or a DOCTYPE declaration.
 *
 * @param source the whole text being parsed
 * @param start offset of the "<" (the caller guarantees "<!" follows)
 * @param domBuilder receiver of comment / startCDATA / characters / endCDATA /
 *        startDTD / endDTD
 * @param errorHandler used to report unclosed comments and CDATA sections
 * @return offset just behind the construct, or -1 when it could not be parsed
 */
function parseDCC(source,start,domBuilder,errorHandler){//sure start with '<!'
	var next= source.charAt(start+2)
	var end;
	switch(next){
	case '-':
		if(source.charAt(start + 3) === '-'){
			end = source.indexOf('-->',start+4);
			//comment text is source.substring(start+4,end)
			if(end>start){
				domBuilder.comment(source,start+4,end-start-4);
				return end+3;
			}else{
				errorHandler.error("Unclosed comment");
				return -1;
			}
		}else{
			//"<!-" not followed by a second "-"
			return -1;
		}
	default:
		if(source.substr(start+3,6) == 'CDATA['){
			end = source.indexOf(']]>',start+9);
			if(end<0){
				// Without this guard a negative length would be handed to
				// characters() and a bogus offset returned.
				errorHandler.error("Unclosed CDATA");
				return -1;
			}
			domBuilder.startCDATA();
			domBuilder.characters(source,start+9,end-start-9);
			domBuilder.endCDATA()
			return end+3;
		}
		//<!DOCTYPE
		//startDTD(java.lang.String name, java.lang.String publicId, java.lang.String systemId)
		var matchs = split(source,start);
		var len = matchs.length;
		if(len>1 && /!doctype/i.test(matchs[0][0])){
			var name = matchs[1][0];
			var pubid = len>3 && /^public$/i.test(matchs[2][0]) && matchs[3][0]
			var sysid = len>4 && matchs[4][0];
			var lastMatch = matchs[len-1]
			// strip the surrounding quotes from the public and system identifiers
			domBuilder.startDTD(name,pubid && pubid.replace(/^(['"])(.*?)\1$/,'$2'),
					sysid && sysid.replace(/^(['"])(.*?)\1$/,'$2'));
			domBuilder.endDTD();

			return lastMatch.index+lastMatch[0].length
		}
	}
	return -1;
}
/**
 * Parses a processing instruction ("<?target data?>") starting at `start`
 * and reports it through domBuilder.processingInstruction(target, data).
 *
 * @param source the whole text being parsed
 * @param start offset of the "<" that opens the instruction
 * @param domBuilder receiver of processingInstruction()
 * @return offset just behind the closing "?>", or -1 when the instruction is
 *         not terminated or malformed
 */
function parseInstruction(source,start,domBuilder){
	var end = source.indexOf('?>',start);
	if(end<0){
		// indexOf reports "not found" as -1, which is truthy; it must be
		// rejected explicitly or the text before `start` would be parsed instead.
		return -1;
	}
	var match = source.substring(start,end).match(/^<\?(\S*)\s*([\s\S]*?)\s*$/);
	if(!match){
		return -1;
	}
	domBuilder.processingInstruction(match[1], match[2]) ;
	return end+2;
}
/**
 * Array-like list of the attributes of one start tag. Entries live under
 * numeric keys as {qName, value, offset}; `tagName` is set through
 * setTagName, and further fields are attached to the entries by other parts
 * of the parser.
 *
 * @param source unused
 */
function ElementAttributes(source){
}
ElementAttributes.prototype = {
	/** Stores the tag name; throws when it does not match tagNamePattern. */
	setTagName: function(tagName){
		if(tagNamePattern.test(tagName)){
			this.tagName = tagName;
			return;
		}
		throw new Error('invalid tagName:'+tagName)
	},
	/**
	 * Appends an attribute; throws when qName does not match tagNamePattern.
	 * `offset` is the position of the attribute in the source text.
	 */
	add: function(qName,value,offset){
		if(!tagNamePattern.test(qName)){
			throw new Error('invalid attribute:'+qName)
		}
		var index = this.length;
		this.length = index + 1;
		this[index] = {qName:qName,value:value,offset:offset};
	},
	// number of attributes; shadowed by an own property after the first add()
	length: 0,
	// accessors by attribute index
	getLocalName: function(i){ return this[i].localName; },
	getLocator: function(i){ return this[i].locator; },
	getQName: function(i){ return this[i].qName; },
	getURI: function(i){ return this[i].uri; },
	getValue: function(i){ return this[i].value; }
};
/**
 * Gives `thiz` the prototype `parent` by assigning `__proto__` and returns
 * `thiz`.
 */
function _set_proto_(thiz,parent){
	thiz.__proto__ = parent;
	return thiz;
}
// Feature test: when assigning __proto__ has no effect in this engine, swap in
// a fallback that builds a new object inheriting from `parent` and copies the
// enumerable properties of `thiz` onto it (so a different object is returned).
if(!(_set_proto_({},_set_proto_.prototype) instanceof _set_proto_)){
	_set_proto_ = function(thiz,parent){
		function Surrogate(){}
		Surrogate.prototype = parent;
		var derived = new Surrogate();
		for(var key in thiz){
			derived[key] = thiz[key];
		}
		return derived;
	};
}
/**
 * Tokenizes a markup declaration such as "<!DOCTYPE ...>" starting at the "<"
 * found at `start`.
 *
 * Tokens are quoted strings, bare words (optionally ending in "=") and the
 * delimiters "<" / ">" (optionally "/>"). The opening "<" is skipped.
 *
 * @return the list of regex match objects up to and including the first
 *         delimiter token, or undefined when the input ends before one is seen
 */
function split(source,start){
	var tokenPattern = /'[^']+'|"[^"]+"|[^\s<>\/=]+=?|(\/?\s*>|<)/g;
	var tokens = [];
	var token;
	tokenPattern.lastIndex = start;
	tokenPattern.exec(source);//skip the opening "<"
	for(token = tokenPattern.exec(source); token; token = tokenPattern.exec(source)){
		tokens.push(token);
		// group 1 is only set for a delimiter, which ends the declaration
		if(token[1]){
			return tokens;
		}
	}
}
- exports.XMLReader = XMLReader;
|