platform/indexing-api/src/com/intellij/lang/cacheBuilder/DefaultWordsScanner.java - platform/tools/idea - Git at Google

 /*
  * Copyright 2000-2009 JetBrains s.r.o.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package com.intellij.lang.cacheBuilder;

 import com.intellij.lexer.Lexer;
 import com.intellij.psi.tree.IElementType;
 import com.intellij.psi.tree.TokenSet;
 import com.intellij.util.Processor;
 import org.jetbrains.annotations.NotNull;

 /**
  * The default implementation of a words scanner based on a custom language lexer.
  *
  * @author max
  */

 public class DefaultWordsScanner extends VersionedWordsScanner {
   private final Lexer myLexer;
   private final TokenSet myIdentifierTokenSet;
   private final TokenSet myCommentTokenSet;
   private final TokenSet myLiteralTokenSet;
   private final TokenSet mySkipCodeContextTokenSet;
   /** When set, each word found in a literal token is reported a second time with kind {@code FOREIGN_LANGUAGE}. */
   private boolean myMayHaveFileRefsInLiterals;

   /**
    * Creates a scanner without an explicit "skip code context" set; {@link TokenSet#EMPTY} is passed
    * to the five-argument constructor in its place.
    *
    * @param lexer              lexer that splits the text into tokens.
    * @param identifierTokenSet token types treated as identifiers.
    * @param commentTokenSet    token types treated as comments.
    * @param literalTokenSet    token types treated as literals.
    */
   public DefaultWordsScanner(final Lexer lexer, final TokenSet identifierTokenSet, final TokenSet commentTokenSet,
                              final TokenSet literalTokenSet) {
     this(lexer, identifierTokenSet, commentTokenSet, literalTokenSet, TokenSet.EMPTY);
   }

   /**
    * Creates a scanner.
    *
    * @param lexer                   lexer that splits the text into tokens.
    * @param identifierTokenSet      token types treated as identifiers.
    * @param commentTokenSet         token types treated as comments.
    * @param literalTokenSet         token types treated as literals.
    * @param skipCodeContextTokenSet token types that must not be reported as code when they match none of the sets above.
    */
   public DefaultWordsScanner(final Lexer lexer, final TokenSet identifierTokenSet, final TokenSet commentTokenSet,
                              final TokenSet literalTokenSet, @NotNull TokenSet skipCodeContextTokenSet) {
     myLexer = lexer;
     mySkipCodeContextTokenSet = skipCodeContextTokenSet;
     myLiteralTokenSet = literalTokenSet;
     myCommentTokenSet = commentTokenSet;
     myIdentifierTokenSet = identifierTokenSet;
   }

   /**
    * Runs the lexer over {@code fileText} and reports the words of every token to {@code processor}.
    * <p>
    * Identifier tokens are reported as {@code CODE}, comment tokens as {@code COMMENTS} and literal tokens as
    * {@code LITERALS}. Any other token is reported as {@code CODE} unless its type belongs to the skip set, in which
    * case it is ignored. Scanning ends as soon as the processor returns {@code false}.
    * <p>
    * A single {@link WordOccurrence} instance is re-initialised and handed to the processor for every word.
    *
    * @param fileText  the text to scan.
    * @param processor receives each word occurrence.
    */
   public void processWords(CharSequence fileText, Processor<WordOccurrence> processor) {
     myLexer.start(fileText);
     final WordOccurrence reusable = new WordOccurrence(fileText, 0, 0, null);

     IElementType tokenType = myLexer.getTokenType();
     while (tokenType != null) {
       // Decide how the current token is classified; null means "do not scan this token at all".
       final WordOccurrence.Kind tokenKind;
       boolean withFileRefs = false;
       if (myIdentifierTokenSet.contains(tokenType)) {
         tokenKind = WordOccurrence.Kind.CODE;
       }
       else if (myCommentTokenSet.contains(tokenType)) {
         tokenKind = WordOccurrence.Kind.COMMENTS;
       }
       else if (myLiteralTokenSet.contains(tokenType)) {
         tokenKind = WordOccurrence.Kind.LITERALS;
         withFileRefs = myMayHaveFileRefsInLiterals;
       }
       else if (mySkipCodeContextTokenSet.contains(tokenType)) {
         tokenKind = null;
       }
       else {
         tokenKind = WordOccurrence.Kind.CODE;
       }

       if (tokenKind != null) {
         final boolean keepGoing =
           stripWords(processor, fileText, myLexer.getTokenStart(), myLexer.getTokenEnd(), tokenKind, reusable, withFileRefs);
         if (!keepGoing) {
           return;
         }
       }

       myLexer.advance();
       tokenType = myLexer.getTokenType();
     }
   }

   /**
    * Breaks the range {@code [from, to)} of {@code tokenText} into words and passes each one to {@code processor}.
    * <p>
    * A word begins at a character that is an ASCII letter, an ASCII digit, {@code '$'}, or a
    * {@link Character#isJavaIdentifierStart(char) Java identifier start}; it continues while characters are ASCII
    * letters, ASCII digits, {@code '$'}, or {@link Character#isJavaIdentifierPart(char) Java identifier parts}.
    *
    * @param processor       receives each word occurrence.
    * @param tokenText       the text that {@code from} and {@code to} index into.
    * @param from            offset of the first character to examine.
    * @param to              offset just past the last character to examine.
    * @param kind            the kind assigned to every reported word.
    * @param occurrence      instance that is re-initialised for each word before it is handed to the processor.
    * @param mayHaveFileRefs if {@code true}, each word is reported once more with kind {@code FOREIGN_LANGUAGE}.
    * @return {@code false} if the processor returned {@code false}, {@code true} once the whole range has been scanned.
    */
   protected static boolean stripWords(final Processor<WordOccurrence> processor,
                                     final CharSequence tokenText,
                                     int from,
                                     int to,
                                     final WordOccurrence.Kind kind,
                                     @NotNull WordOccurrence occurrence,
                                     boolean mayHaveFileRefs
   ) {
     // The cheap ASCII test always runs first: the Character.isJavaIdentifierXxx checks are costly because of unicode.
     int pos = from;

     while (true) {
       // Skip everything that cannot open a word.
       while (pos != to) {
         final char candidate = tokenText.charAt(pos);
         if (isAsciiIdentifierPart(candidate) || Character.isJavaIdentifierStart(candidate)) {
           break;
         }
         pos++;
       }
       if (pos == to) {
         return true;
       }

       // Consume the word: the first character is already accepted, so step past it before testing.
       final int wordStart = pos;
       do {
         pos++;
       }
       while (pos != to
              && (isAsciiIdentifierPart(tokenText.charAt(pos)) || Character.isJavaIdentifierPart(tokenText.charAt(pos))));
       final int wordEnd = pos;

       occurrence.init(tokenText, wordStart, wordEnd, kind);
       if (!processor.process(occurrence)) {
         return false;
       }

       if (mayHaveFileRefs) {
         occurrence.init(tokenText, wordStart, wordEnd, WordOccurrence.Kind.FOREIGN_LANGUAGE);
         if (!processor.process(occurrence)) {
           return false;
         }
       }
     }
   }

   /** Returns {@code true} for ASCII letters, ASCII digits and {@code '$'}. */
   private static boolean isAsciiIdentifierPart(char c) {
     if (c >= 'a' && c <= 'z') {
       return true;
     }
     if (c >= 'A' && c <= 'Z') {
       return true;
     }
     if (c >= '0' && c <= '9') {
       return true;
     }
     return c == '$';
   }

   /**
    * Controls whether words inside literal tokens are additionally reported with kind {@code FOREIGN_LANGUAGE}.
    *
    * @param mayHaveFileRefsInLiterals the new value of the flag.
    */
   public void setMayHaveFileRefsInLiterals(final boolean mayHaveFileRefsInLiterals) {
     myMayHaveFileRefsInLiterals = mayHaveFileRefsInLiterals;
   }
 }
	/*
	* Copyright 2000-2009 JetBrains s.r.o.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package com.intellij.lang.cacheBuilder;

	import com.intellij.lexer.Lexer;
	import com.intellij.psi.tree.IElementType;
	import com.intellij.psi.tree.TokenSet;
	import com.intellij.util.Processor;
	import org.jetbrains.annotations.NotNull;

	/**
	* The default implementation of a words scanner based on a custom language lexer.
	*
	* @author max
	*/

	public class DefaultWordsScanner extends VersionedWordsScanner {
	private final Lexer myLexer;
	private final TokenSet myIdentifierTokenSet;
	private final TokenSet myCommentTokenSet;
	private final TokenSet myLiteralTokenSet;
	private final TokenSet mySkipCodeContextTokenSet;
	private boolean myMayHaveFileRefsInLiterals;

	/**
	* Creates a new instance of the words scanner.
	*
	* @param lexer the lexer used for breaking the text into tokens.
	* @param identifierTokenSet the set of token types which represent identifiers.
	* @param commentTokenSet the set of token types which represent comments.
	* @param literalTokenSet the set of token types which represent literals.
	*/
	public DefaultWordsScanner(final Lexer lexer, final TokenSet identifierTokenSet, final TokenSet commentTokenSet,
	final TokenSet literalTokenSet) {
	this(lexer, identifierTokenSet, commentTokenSet, literalTokenSet, TokenSet.EMPTY);
	}

	/**
	* Creates a new instance of the words scanner.
	*
	* @param lexer the lexer used for breaking the text into tokens.
	* @param identifierTokenSet the set of token types which represent identifiers.
	* @param commentTokenSet the set of token types which represent comments.
	* @param literalTokenSet the set of token types which represent literals.
	* @param skipCodeContextTokenSet the set of token types which should not be considered as code context.
	*/
	public DefaultWordsScanner(final Lexer lexer, final TokenSet identifierTokenSet, final TokenSet commentTokenSet,
	final TokenSet literalTokenSet, @NotNull TokenSet skipCodeContextTokenSet) {
	myLexer = lexer;
	myIdentifierTokenSet = identifierTokenSet;
	myCommentTokenSet = commentTokenSet;
	myLiteralTokenSet = literalTokenSet;
	mySkipCodeContextTokenSet = skipCodeContextTokenSet;
	}

	public void processWords(CharSequence fileText, Processor<WordOccurrence> processor) {
	myLexer.start(fileText);
	WordOccurrence occurrence = new WordOccurrence(fileText, 0, 0, null); // shared occurrence

	IElementType type;
	while ((type = myLexer.getTokenType()) != null) {
	if (myIdentifierTokenSet.contains(type)) {
	//occurrence.init(fileText, myLexer.getTokenStart(), myLexer.getTokenEnd(), WordOccurrence.Kind.CODE);
	//if (!processor.process(occurrence)) return;
	if (!stripWords(processor, fileText, myLexer.getTokenStart(), myLexer.getTokenEnd(), WordOccurrence.Kind.CODE, occurrence, false)) return; }
	else if (myCommentTokenSet.contains(type)) {
	if (!stripWords(processor, fileText,myLexer.getTokenStart(),myLexer.getTokenEnd(), WordOccurrence.Kind.COMMENTS,occurrence, false)) return;
	}
	else if (myLiteralTokenSet.contains(type)) {
	if (!stripWords(processor, fileText, myLexer.getTokenStart(),myLexer.getTokenEnd(),WordOccurrence.Kind.LITERALS,occurrence, myMayHaveFileRefsInLiterals)) return;
	}
	else if (!mySkipCodeContextTokenSet.contains(type)) {
	if (!stripWords(processor, fileText, myLexer.getTokenStart(), myLexer.getTokenEnd(), WordOccurrence.Kind.CODE, occurrence, false)) return;
	}
	myLexer.advance();
	}
	}

	protected static boolean stripWords(final Processor<WordOccurrence> processor,
	final CharSequence tokenText,
	int from,
	int to,
	final WordOccurrence.Kind kind,
	@NotNull WordOccurrence occurrence,
	boolean mayHaveFileRefs
	) {
	// This code seems strange but it is more effective as Character.isJavaIdentifier_xxx_ is quite costly operation due to unicode
	int index = from;

	ScanWordsLoop:
	while (true) {
	while (true) {
	if (index == to) break ScanWordsLoop;
	char c = tokenText.charAt(index);
	if (isAsciiIdentifierPart(c) \|\| Character.isJavaIdentifierStart(c)) {
	break;
	}
	index++;
	}
	int wordStart = index;
	while (true) {
	index++;
	if (index == to) break;
	char c = tokenText.charAt(index);
	if (isAsciiIdentifierPart(c)) continue;
	if (!Character.isJavaIdentifierPart(c)) break;
	}
	int wordEnd = index;
	occurrence.init(tokenText, wordStart, wordEnd, kind);

	if (!processor.process(occurrence)) return false;

	if (mayHaveFileRefs) {
	occurrence.init(tokenText, wordStart, wordEnd, WordOccurrence.Kind.FOREIGN_LANGUAGE);
	if (!processor.process(occurrence)) return false;
	}
	}
	return true;
	}

	private static boolean isAsciiIdentifierPart(char c) {
	return (c >= 'a' && c <= 'z') \|\| (c >= 'A' && c <= 'Z') \|\| (c >= '0' && c <= '9') \|\| c == '$';
	}

	public void setMayHaveFileRefsInLiterals(final boolean mayHaveFileRefsInLiterals) {
	myMayHaveFileRefsInLiterals = mayHaveFileRefsInLiterals;
	}
	}