@@ -140,7 +140,7 @@ function pr_isIE6() {
140140 "BEGIN END " ;
141141 var SH_KEYWORDS = "break case continue do done elif else esac eval fi for " +
142142 "function if in local set then until while " ;
143- var ALL_KEYWORD_SET = wordSet (
143+ var ALL_KEYWORDS = (
144144 CPP_KEYWORDS + CSHARP_KEYWORDS + JSCRIPT_KEYWORDS + PERL_KEYWORDS +
145145 PYTHON_KEYWORDS + RUBY_KEYWORDS + SH_KEYWORDS ) ;
146146
@@ -270,6 +270,7 @@ function pr_isIE6() {
270270 var pr_aposEnt = / & a p o s ; / g;
271271 var pr_quotEnt = / & q u o t ; / g;
272272 var pr_ampEnt = / & a m p ; / g;
273+ var pr_nbspEnt = / & n b s p ; / g;
273274 /** unescapes html to plain text. */
274275 function htmlToText ( html ) {
275276 var pos = html . indexOf ( '&' ) ;
@@ -298,7 +299,8 @@ function pr_isIE6() {
298299 . replace ( pr_gtEnt , '>' )
299300 . replace ( pr_aposEnt , "'" )
300301 . replace ( pr_quotEnt , '"' )
301- . replace ( pr_ampEnt , '&' ) ;
302+ . replace ( pr_ampEnt , '&' )
303+ . replace ( pr_nbspEnt , ' ' ) ;
302304 }
303305
304306 /** is the given node's innerHTML normally unescaped? */
@@ -333,7 +335,7 @@ function pr_isIE6() {
333335 break ;
334336 }
335337 }
336-
338+
337339 var PR_innerHtmlWorks = null ;
338340 function getInnerHtml ( node ) {
339341 // inner html is hopelessly broken in Safari 2.0.4 when the content is
@@ -497,7 +499,7 @@ function pr_isIE6() {
497499 * function that takes source code and returns a list of decorations.
498500 */
499501 function createSimpleLexer ( shortcutStylePatterns ,
500- fallthroughStylePatterns ) {
502+ fallthroughStylePatterns ) {
501503 var shortcuts = { } ;
502504 ( function ( ) {
503505 var allPatterns = shortcutStylePatterns . concat ( fallthroughStylePatterns ) ;
@@ -562,76 +564,6 @@ function pr_isIE6() {
562564 } ;
563565 }
564566
565- var PR_C_STYLE_STRING_AND_COMMENT_LEXER = createSimpleLexer ( [
566- [ PR_STRING , / ^ \' (?: [ ^ \\ \' ] | \\ [ \s \S ] ) * (?: \' | $ ) / , null , "'" ] ,
567- [ PR_STRING , / ^ \" (?: [ ^ \\ \" ] | \\ [ \s \S ] ) * (?: \" | $ ) / , null , '"' ] ,
568- [ PR_STRING , / ^ \` (?: [ ^ \\ \` ] | \\ [ \s \S ] ) * (?: \` | $ ) / , null , '`' ]
569- ] , [
570- [ PR_PLAIN , / ^ (?: [ ^ \' \" \` \/ \# ] + ) / , null , ' \r\n' ] ,
571- [ PR_COMMENT , / ^ # [ ^ \r \n ] * / , null , '#' ] ,
572- [ PR_COMMENT , / ^ \/ \/ [ ^ \r \n ] * / , null ] ,
573- [ PR_STRING , / ^ \/ (?: [ ^ \\ \* \/ ] | \\ [ \s \S ] ) + (?: \/ | $ ) / ,
574- REGEXP_PRECEDER_PATTERN ] ,
575- [ PR_COMMENT , / ^ \/ \* [ \s \S ] * ?(?: \* \/ | $ ) / , null ]
576- ] ) ;
577- /** splits the given string into comment, string, and "other" tokens.
578- * @param {string } sourceCode as plain text
579- * @return {Array.<number|string> } a decoration list.
580- * @private
581- */
582- function splitStringAndCommentTokens ( sourceCode ) {
583- return PR_C_STYLE_STRING_AND_COMMENT_LEXER ( sourceCode ) ;
584- }
585-
586- var PR_C_STYLE_LITERAL_IDENTIFIER_PUNC_RECOGNIZER = createSimpleLexer ( [ ] , [
587- [ PR_PLAIN , / ^ \s + / , null , ' \r\n' ] ,
588- // TODO(mikesamuel): recognize non-latin letters and numerals in idents
589- [ PR_PLAIN , / ^ [ a - z _ $ @ ] [ a - z _ $ @ 0 - 9 ] * / i, null ] ,
590- // A hex number
591- [ PR_LITERAL , / ^ 0 x [ a - f 0 - 9 ] + [ a - z ] / i, null ] ,
592- // An octal or decimal number, possibly in scientific notation
593- [ PR_LITERAL , / ^ (?: \d (?: _ \d + ) * \d * (?: \. \d * ) ? | \. \d + ) (?: e [ + \- ] ? \d + ) ? [ a - z ] * / i,
594- null , '123456789' ] ,
595- [ PR_PUNCTUATION , / ^ [ ^ \s \w \. $ @ ] + / , null ]
596- // Fallback will handle decimal points not adjacent to a digit
597- ] ) ;
598-
599- /** splits plain text tokens into more specific tokens, and then tries to
600- * recognize keywords, and types.
601- * @private
602- */
603- function splitNonStringNonCommentTokens ( source , decorations ) {
604- for ( var i = 0 ; i < decorations . length ; i += 2 ) {
605- var style = decorations [ i + 1 ] ;
606- if ( style === PR_PLAIN ) {
607- var start , end , chunk , subDecs ;
608- start = decorations [ i ] ;
609- end = i + 2 < decorations . length ? decorations [ i + 2 ] : source . length ;
610- chunk = source . substring ( start , end ) ;
611- subDecs = PR_C_STYLE_LITERAL_IDENTIFIER_PUNC_RECOGNIZER ( chunk , start ) ;
612- for ( var j = 0 , m = subDecs . length ; j < m ; j += 2 ) {
613- var subStyle = subDecs [ j + 1 ] ;
614- if ( subStyle === PR_PLAIN ) {
615- var subStart = subDecs [ j ] ;
616- var subEnd = j + 2 < m ? subDecs [ j + 2 ] : chunk . length ;
617- var token = source . substring ( subStart , subEnd ) ;
618- if ( token === '.' ) {
619- subDecs [ j + 1 ] = PR_PUNCTUATION ;
620- } else if ( token in ALL_KEYWORD_SET ) {
621- subDecs [ j + 1 ] = PR_KEYWORD ;
622- } else if ( / ^ @ ? [ A - Z ] [ A - Z $ ] * [ a - z ] [ A - Z a - z $ ] * $ / . test ( token ) ) {
623- // classify types and annotations using Java's style conventions
624- subDecs [ j + 1 ] = token . charAt ( 0 ) === '@' ? PR_LITERAL : PR_TYPE ;
625- }
626- }
627- }
628- spliceArrayInto ( subDecs , decorations , i , 2 ) ;
629- i += subDecs . length - 2 ;
630- }
631- }
632- return decorations ;
633- }
634-
635567 var PR_MARKUP_LEXER = createSimpleLexer ( [ ] , [
636568 [ PR_PLAIN , / ^ [ ^ < ] + / , null ] ,
637569 [ PR_DECLARATION , / ^ < ! \w [ ^ > ] * (?: > | $ ) / , null ] ,
@@ -704,7 +636,7 @@ function pr_isIE6() {
704636 return decorations ;
705637 }
706638
707- /** returns a list of decorations, where even entries
639+ /** returns a function that produces a list of decorations from source text.
708640 *
709641 * This code treats ", ', and ` as string delimiters, and \ as a string
710642 * escape. It does not recognize perl's qq() style strings.
@@ -715,30 +647,130 @@ function pr_isIE6() {
715647 *
716648 * It recognizes C, C++, and shell style comments.
717649 *
718- * @param {string } sourceCode as plain text
719- * @return {Array.<string|number> } a decoration list
650+ * @param {Object } options a set of optional parameters.
651+ * @return {function (sourceCode : string) : Array.<string|number> } a
652+ * decorator that takes sourceCode as plain text and that returns a
653+ * decoration list
720654 */
721- function decorateSource ( sourceCode ) {
722- // Split into strings, comments, and other.
723- // We do this because strings and comments are easily recognizable and can
724- // contain stuff that looks like other tokens, so we want to mark those
725- // early so we don't recurse into them.
726- var decorations = splitStringAndCommentTokens ( sourceCode ) ;
655+ function sourceDecorator ( options ) {
656+ var shortcutStylePatterns = [ ] , fallthroughStylePatterns = [ ] ;
657+ if ( options . tripleQuotedStrings ) {
658+ shortcutStylePatterns . push (
659+ [ PR_STRING , / ^ (?: \' \' \' (?: [ ^ \' \\ ] | \\ [ \s \S ] | \' { 1 , 2 } (? = [ ^ \' ] ) ) * (?: \' \' \' | $ ) | \" \" \" (?: [ ^ \" \\ ] | \\ [ \s \S ] | \" { 1 , 2 } (? = [ ^ \" ] ) ) * (?: \" \" \" | $ ) | \' (?: [ ^ \\ \' ] | \\ [ \s \S ] ) * (?: \' | $ ) | \" (?: [ ^ \\ \" ] | \\ [ \s \S ] ) * (?: \" | $ ) ) / ,
660+ null , '\'"' ] ) ;
661+ } else if ( options . multiLineStrings ) {
662+ shortcutStylePatterns . push (
663+ [ PR_STRING , / ^ (?: \' (?: [ ^ \\ \' ] | \\ [ \s \S ] ) * (?: \' | $ ) | \" (?: [ ^ \\ \" ] | \\ [ \s \S ] ) * (?: \" | $ ) | \` (?: [ ^ \\ \` ] | \\ [ \s \S ] ) * (?: \` | $ ) ) / ,
664+ null , '\'"`' ] ) ;
665+ } else {
666+ shortcutStylePatterns . push (
667+ [ PR_STRING ,
668+ / ^ (?: \' (?: [ ^ \\ \' \r \n ] | \\ .) * (?: \' | $ ) | \" (?: [ ^ \\ \" \r \n ] | \\ .) * (?: \" | $ ) ) / ,
669+ null , '"\'' ] ) ;
670+ }
671+ fallthroughStylePatterns . push (
672+ [ PR_PLAIN , / ^ (?: [ ^ \' \" \` \/ \# ] + ) / , null , ' \r\n' ] ) ;
673+ if ( options . hashComments ) {
674+ shortcutStylePatterns . push ( [ PR_COMMENT , / ^ # [ ^ \r \n ] * / , null , '#' ] ) ;
675+ }
676+ if ( options . cStyleComments ) {
677+ fallthroughStylePatterns . push ( [ PR_COMMENT , / ^ \/ \/ [ ^ \r \n ] * / , null ] ) ;
678+ }
679+ if ( options . regexLiterals ) {
680+ fallthroughStylePatterns . push (
681+ [ PR_STRING ,
682+ / ^ \/ (?: [ ^ \\ \* \/ \[ ] | \\ [ \s \S ] | \[ (?: [ ^ \] \\ ] | \\ .) * (?: \] | $ ) ) + (?: \/ | $ ) / ,
683+ REGEXP_PRECEDER_PATTERN ] ) ;
684+ }
685+ if ( options . cStyleComments ) {
686+ fallthroughStylePatterns . push (
687+ [ PR_COMMENT , / ^ \/ \* [ \s \S ] * ?(?: \* \/ | $ ) / , null ] ) ;
688+ }
727689
728- // Split non comment|string tokens on whitespace and word boundaries
729- decorations = splitNonStringNonCommentTokens ( sourceCode , decorations ) ;
690+ var keywords = wordSet ( options . keywords ) ;
691+
692+ options = null ;
693+
694+ /** splits the given string into comment, string, and "other" tokens.
695+ * @param {string } sourceCode as plain text
696+ * @return {Array.<number|string> } a decoration list.
697+ * @private
698+ */
699+ var splitStringAndCommentTokens = createSimpleLexer (
700+ shortcutStylePatterns , fallthroughStylePatterns ) ;
701+
702+ var styleLiteralIdentifierPuncRecognizer = createSimpleLexer ( [ ] , [
703+ [ PR_PLAIN , / ^ \s + / , null , ' \r\n' ] ,
704+ // TODO(mikesamuel): recognize non-latin letters and numerals in idents
705+ [ PR_PLAIN , / ^ [ a - z _ $ @ ] [ a - z _ $ @ 0 - 9 ] * / i, null ] ,
706+ // A hex number
707+ [ PR_LITERAL , / ^ 0 x [ a - f 0 - 9 ] + [ a - z ] / i, null ] ,
708+ // An octal or decimal number, possibly in scientific notation
709+ [ PR_LITERAL ,
710+ / ^ (?: \d (?: _ \d + ) * \d * (?: \. \d * ) ? | \. \d + ) (?: e [ + \- ] ? \d + ) ? [ a - z ] * / i,
711+ null , '123456789' ] ,
712+ [ PR_PUNCTUATION , / ^ [ ^ \s \w \. $ @ ] + / , null ]
713+ // Fallback will handle decimal points not adjacent to a digit
714+ ] ) ;
730715
731- return decorations ;
732- }
716+ /** splits plain text tokens into more specific tokens, and then tries to
717+ * recognize keywords, and types.
718+ * @private
719+ */
720+ function splitNonStringNonCommentTokens ( source , decorations ) {
721+ for ( var i = 0 ; i < decorations . length ; i += 2 ) {
722+ var style = decorations [ i + 1 ] ;
723+ if ( style === PR_PLAIN ) {
724+ var start , end , chunk , subDecs ;
725+ start = decorations [ i ] ;
726+ end = i + 2 < decorations . length ? decorations [ i + 2 ] : source . length ;
727+ chunk = source . substring ( start , end ) ;
728+ subDecs = styleLiteralIdentifierPuncRecognizer ( chunk , start ) ;
729+ for ( var j = 0 , m = subDecs . length ; j < m ; j += 2 ) {
730+ var subStyle = subDecs [ j + 1 ] ;
731+ if ( subStyle === PR_PLAIN ) {
732+ var subStart = subDecs [ j ] ;
733+ var subEnd = j + 2 < m ? subDecs [ j + 2 ] : chunk . length ;
734+ var token = source . substring ( subStart , subEnd ) ;
735+ if ( token === '.' ) {
736+ subDecs [ j + 1 ] = PR_PUNCTUATION ;
737+ } else if ( token in keywords ) {
738+ subDecs [ j + 1 ] = PR_KEYWORD ;
739+ } else if ( / ^ @ ? [ A - Z ] [ A - Z $ ] * [ a - z ] [ A - Z a - z $ ] * $ / . test ( token ) ) {
740+ // classify types and annotations using Java's style conventions
741+ subDecs [ j + 1 ] = token . charAt ( 0 ) === '@' ? PR_LITERAL : PR_TYPE ;
742+ }
743+ }
744+ }
745+ spliceArrayInto ( subDecs , decorations , i , 2 ) ;
746+ i += subDecs . length - 2 ;
747+ }
748+ }
749+ return decorations ;
750+ }
733751
734- function cSourceDecorator ( keywords , opt_options ) {
735- return decorateSource ; // TODO: implement me
736- }
752+ return function ( sourceCode ) {
753+ // Split into strings, comments, and other.
754+ // We do this because strings and comments are easily recognizable and can
755+ // contain stuff that looks like other tokens, so we want to mark those
756+ // early so we don't recurse into them.
757+ var decorations = splitStringAndCommentTokens ( sourceCode ) ;
758+
759+ // Split non comment|string tokens on whitespace and word boundaries
760+ decorations = splitNonStringNonCommentTokens ( sourceCode , decorations ) ;
737761
738- function shellSourceDecorator ( keywords , opt_options ) {
739- return decorateSource ; // TODO: implement me
762+ return decorations ;
763+ } ;
740764 }
741765
766+ var decorateSource = sourceDecorator ( {
767+ keywords : ALL_KEYWORDS ,
768+ hashComments : true ,
769+ cStyleComments : true ,
770+ multiLineStrings : true ,
771+ regexLiterals : true
772+ } ) ;
773+
742774 /** identify regions of markup that are really source code, and recursively
743775 * lex them.
744776 * @private
@@ -958,22 +990,44 @@ function pr_isIE6() {
958990 }
959991 registerLangHandler ( decorateSource , [ 'default-code' ] ) ;
960992 registerLangHandler ( decorateMarkup ,
961- [ 'default-markup' , 'html' , 'htm' , 'xhtml' , 'xml' ] ) ;
962- registerLangHandler ( cSourceDecorator ( CPP_KEYWORDS ) ,
963- [ 'c' , 'cc' , 'cpp' , 'cs' , 'cxx' , 'cyc' ] ) ;
964- registerLangHandler ( cSourceDecorator ( JAVA_KEYWORDS ) , [ 'java' ] ) ;
965- registerLangHandler ( shellSourceDecorator ( SH_KEYWORDS ) , [ 'csh' , 'sh' ] ) ;
966- registerLangHandler (
967- shellSourceDecorator ( PYTHON_KEYWORDS ) , [ 'cv' , 'py' ] ,
968- { tripleQuotedStrings : true } ) ;
969- registerLangHandler (
970- shellSourceDecorator ( PERL_KEYWORDS ,
971- { regexLiteral : true , multiLineStrings : true } ) , [ 'pl' ] ) ;
972- registerLangHandler (
973- shellSourceDecorator ( RUBY_KEYWORDS ,
974- { regexLiteral : true , multiLineStrings : true } ) , [ 'rb' ] ) ;
975- registerLangHandler (
976- cSourceDecorator ( JSCRIPT_KEYWORDS , { regexLiteral : true } ) , [ 'js' ] ) ;
993+ [ 'default-markup' , 'html' , 'htm' , 'xhtml' , 'xml' , 'xsl' ] ) ;
994+ registerLangHandler ( sourceDecorator ( {
995+ keywords : CPP_KEYWORDS ,
996+ hashComments : true ,
997+ cStyleComments : true
998+ } ) , [ 'c' , 'cc' , 'cpp' , 'cs' , 'cxx' , 'cyc' ] ) ;
999+ registerLangHandler ( sourceDecorator ( {
1000+ keywords : JAVA_KEYWORDS ,
1001+ cStyleComments : true
1002+ } ) , [ 'java' ] ) ;
1003+ registerLangHandler ( sourceDecorator ( {
1004+ keywords : SH_KEYWORDS ,
1005+ hashComments : true ,
1006+ multiLineStrings : true
1007+ } ) , [ 'bsh' , 'csh' , 'sh' ] ) ;
1008+ registerLangHandler ( sourceDecorator ( {
1009+ keywords : PYTHON_KEYWORDS ,
1010+ hashComments : true ,
1011+ multiLineStrings : true ,
1012+ tripleQuotedStrings : true
1013+ } ) , [ 'cv' , 'py' ] ) ;
1014+ registerLangHandler ( sourceDecorator ( {
1015+ keywords : PERL_KEYWORDS ,
1016+ hashComments : true ,
1017+ multiLineStrings : true ,
1018+ regexLiterals : true
1019+ } ) , [ 'perl' , 'pl' , 'pm' ] ) ;
1020+ registerLangHandler ( sourceDecorator ( {
1021+ keywords : RUBY_KEYWORDS ,
1022+ hashComments : true ,
1023+ multiLineStrings : true ,
1024+ regexLiterals : true
1025+ } ) , [ 'rb' ] ) ;
1026+ registerLangHandler ( sourceDecorator ( {
1027+ keywords : JSCRIPT_KEYWORDS ,
1028+ cStyleComments : true ,
1029+ regexLiterals : true
1030+ } ) , [ 'js' ] ) ;
9771031
9781032 function prettyPrintOne ( sourceCodeHtml , opt_langExtension ) {
9791033 try {
0 commit comments