spellchecker/src/com/intellij/spellchecker/inspections/IdentifierSplitter.java - platform/tools/idea - Git at Google

 /*
  * Copyright 2000-2010 JetBrains s.r.o.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package com.intellij.spellchecker.inspections;

 import com.intellij.openapi.util.TextRange;
 import com.intellij.spellchecker.util.Strings;
 import com.intellij.util.Consumer;
 import org.jetbrains.annotations.NonNls;
 import org.jetbrains.annotations.NotNull;
 import org.jetbrains.annotations.Nullable;

 import java.util.ArrayList;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;


 /**
  * Splitter for identifier-like text.
  *
  * <p>{@link #split} first partitions the requested range with the inherited
  * {@code excludeByPattern} helper and {@link #WORD_IN_QUOTES}, then cuts every
  * resulting fragment into words at case transitions, CJK characters and
  * non-word characters, and finally reports the words through the inherited
  * {@code addWord} helper.
  *
  * <p>NOTE(review): {@code excludeByPattern}, {@code addWord}, {@code matcherRange},
  * {@code containsShortWord} and {@code isAllWordsAreUpperCased} are not defined in
  * this class (presumably they come from {@link BaseSplitter}); their exact
  * semantics should be confirmed there.
  */
 public class IdentifierSplitter extends BaseSplitter {
   private static final IdentifierSplitter INSTANCE = new IdentifierSplitter();

   /** Minimal length (in chars) a fragment must have to be reported by {@link #splitByCase}. */
   private static final int MIN_WORD_LENGTH = 4;

   /** @return the shared splitter instance. */
   public static IdentifierSplitter getInstance() {
     return INSTANCE;
   }

   /** Letters with at most one apostrophe inside, starting at a word boundary. */
   @NonNls
   private static final Pattern WORD = Pattern.compile("\\b\\p{L}*'?\\p{L}*");


   /** A single-quoted fragment; group 1 is the text between the quotes. */
   @NonNls
   private static final Pattern WORD_IN_QUOTES = Pattern.compile("'([^']*)'");

   /**
    * Splits {@code text} within {@code range} into words and reports them.
    *
    * <p>Nothing is reported when {@code text} is {@code null}, the range is empty,
    * or the range starts at a negative offset.
    *
    * @param text     the text to split, may be {@code null}
    * @param range    the part of {@code text} to process
    * @param consumer receiver handed to {@code addWord} for every reported word
    */
   @Override
   public void split(@Nullable String text, @NotNull TextRange range, Consumer<TextRange> consumer) {
     if (text == null || range.getLength() < 1 || range.getStartOffset() < 0) {
       return;
     }

     for (TextRange fragment : excludeByPattern(text, range, WORD_IN_QUOTES, 1)) {
       List<TextRange> words = splitByCase(text, fragment);

       if (words.isEmpty()) {
         continue;
       }

       if (words.size() == 1) {
         // A lone word is reported as is, without running the WORD pattern over it.
         addWord(consumer, false, words.get(0));
         continue;
       }

       // Both checks are evaluated unconditionally, exactly as before.
       boolean firstWordCapitalized = Strings.isCapitalized(text, words.get(0));
       boolean hasShortWord = containsShortWord(words);
       if (firstWordCapitalized && hasShortWord) {
         // The whole fragment is skipped in this case.
         continue;
       }

       boolean everyWordUpperCased = isAllWordsAreUpperCased(text, words);

       for (TextRange word : words) {
         // True for an upper-cased word inside a fragment that is not entirely upper-cased.
         // NOTE(review): presumably addWord treats this flag as "ignore" - confirm in BaseSplitter.
         boolean flag = Strings.isUpperCased(text, word) && !everyWordUpperCased;
         Matcher matcher = WORD.matcher(text.substring(word.getStartOffset(), word.getEndOffset()));
         if (matcher.find()) {
           addWord(consumer, flag, matcherRange(word, matcher));
         }
       }
     }
   }

   /**
    * Cuts the given range of {@code text} into word ranges.
    *
    * <p>A word is a run of word characters (see {@link #isWordCharType}). A run is cut
    * <ul>
    *   <li>before an upper-case letter that follows a lower-case one ({@code a|Camel});</li>
    *   <li>before the last upper-case letter of a run that is followed by a lower-case
    *       letter ({@code CAPITALN|ext} yields {@code CAPITAL} and {@code Next});</li>
    *   <li>at any CJK character (see {@link #isCjk}) or non-word character, which is
    *       itself never part of a word.</li>
    * </ul>
    * Fragments shorter than {@link #MIN_WORD_LENGTH} characters are dropped.
    *
    * @param text  the text being split
    * @param range the part of {@code text} to scan
    * @return the word ranges in order of appearance, possibly empty
    */
   @NotNull
   private static List<TextRange> splitByCase(@NotNull String text, @NotNull TextRange range) {
     List<TextRange> result = new ArrayList<TextRange>();
     final int end = range.getEndOffset();
     int wordStart = -1; // -1 means "not inside a word"
     int prevType = Character.MATH_SYMBOL; // any non-letter category works as the initial value

     for (int i = range.getStartOffset(); i < end; i++) {
       final char ch = text.charAt(i);
       if (isCjk(ch)) {
         if (wordStart >= 0) {
           addIfLongEnough(result, wordStart, i);
           wordStart = -1;
         }
         prevType = Character.MATH_SYMBOL;
         continue;
       }

       final int type = Character.getType(ch);
       if (isWordCharType(type)) {
         if (wordStart < 0) {
           wordStart = i;
         }
         else if (type == Character.UPPERCASE_LETTER && prevType == Character.LOWERCASE_LETTER) {
           //a|Camel
           addIfLongEnough(result, wordStart, i);
           wordStart = i;
         }
         else if (type == Character.LOWERCASE_LETTER && prevType == Character.UPPERCASE_LETTER) {
           //CAPITALN|ext: the preceding capital belongs to the word that starts here
           addIfLongEnough(result, wordStart, i - 1);
           wordStart = i - 1;
         }
       }
       else if (wordStart >= 0) {
         addIfLongEnough(result, wordStart, i);
         wordStart = -1;
       }
       prevType = type;
     }

     // Word still open when the range ends.
     if (wordStart >= 0) {
       addIfLongEnough(result, wordStart, end);
     }
     return result;
   }

   /**
    * @return {@code true} if {@code ch} lies in one of the blocks that are never
    *         treated as part of a word: Hiragana, Katakana, CJK Unified Ideographs,
    *         CJK Compatibility Ideographs, or Halfwidth and Fullwidth Forms.
    */
   private static boolean isCjk(char ch) {
     return ch >= '\u3040' && ch <= '\u309f' || // Hiragana
            ch >= '\u30A0' && ch <= '\u30ff' || // Katakana
            ch >= '\u4E00' && ch <= '\u9FFF' || // CJK Unified ideographs
            ch >= '\uF900' && ch <= '\uFAFF' || // CJK Compatibility Ideographs
            ch >= '\uFF00' && ch <= '\uFFEF';   // Halfwidth and Fullwidth Forms of Katakana & Fullwidth ASCII variants
   }

   /**
    * @param type a general category as returned by {@link Character#getType(char)}
    * @return {@code true} for the letter categories and for
    *         {@link Character#OTHER_PUNCTUATION} (the category of the apostrophe
    *         that {@link #WORD} allows inside a word - presumably why it is accepted)
    */
   private static boolean isWordCharType(int type) {
     return type == Character.LOWERCASE_LETTER ||
            type == Character.UPPERCASE_LETTER ||
            type == Character.TITLECASE_LETTER ||
            type == Character.OTHER_LETTER ||
            type == Character.MODIFIER_LETTER ||
            type == Character.OTHER_PUNCTUATION;
   }

   /**
    * Appends the range {@code [start, end)} to {@code result} unless it is shorter
    * than {@link #MIN_WORD_LENGTH} characters.
    */
   private static void addIfLongEnough(List<TextRange> result, int start, int end) {
     if (end - start >= MIN_WORD_LENGTH) {
       result.add(new TextRange(start, end));
     }
   }
 }
	/*
	* Copyright 2000-2010 JetBrains s.r.o.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package com.intellij.spellchecker.inspections;

	import com.intellij.openapi.util.TextRange;
	import com.intellij.spellchecker.util.Strings;
	import com.intellij.util.Consumer;
	import org.jetbrains.annotations.NonNls;
	import org.jetbrains.annotations.NotNull;
	import org.jetbrains.annotations.Nullable;

	import java.util.ArrayList;
	import java.util.List;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;


	public class IdentifierSplitter extends BaseSplitter {
	private static final IdentifierSplitter INSTANCE = new IdentifierSplitter();

	public static IdentifierSplitter getInstance() {
	return INSTANCE;
	}

	@NonNls
	private static final Pattern WORD = Pattern.compile("\\b\\p{L}'?\\p{L}");


	@NonNls
	private static final Pattern WORD_IN_QUOTES = Pattern.compile("'([^']*)'");

	@Override
	public void split(@Nullable String text, @NotNull TextRange range, Consumer<TextRange> consumer) {
	if (text == null \|\| range.getLength() < 1 \|\| range.getStartOffset() < 0) {
	return;
	}

	List<TextRange> extracted = excludeByPattern(text, range, WORD_IN_QUOTES, 1);

	for (TextRange textRange : extracted) {
	List<TextRange> words = splitByCase(text, textRange);

	if (words.size() == 0) {
	continue;
	}

	if (words.size() == 1) {
	addWord(consumer, false, words.get(0));
	continue;
	}

	boolean isCapitalized = Strings.isCapitalized(text, words.get(0));
	boolean containsShortWord = containsShortWord(words);

	if (isCapitalized && containsShortWord) {
	continue;
	}

	boolean isAllWordsAreUpperCased = isAllWordsAreUpperCased(text, words);

	for (TextRange word : words) {
	boolean uc = Strings.isUpperCased(text, word);
	boolean flag = (uc && !isAllWordsAreUpperCased);
	Matcher matcher = WORD.matcher(text.substring(word.getStartOffset(), word.getEndOffset()));
	if (matcher.find()) {
	TextRange found = matcherRange(word, matcher);
	addWord(consumer, flag, found);
	}
	}
	}
	}

	@NotNull
	private static List<TextRange> splitByCase(@NotNull String text, @NotNull TextRange range) {
	//System.out.println("text = " + text + " range = " + range);
	List<TextRange> result = new ArrayList<TextRange>();
	int i = range.getStartOffset();
	int s = -1;
	int prevType = Character.MATH_SYMBOL;
	while (i < range.getEndOffset()) {
	final char ch = text.charAt(i);
	if (ch >= '\u3040' && ch <= '\u309f' \|\| // Hiragana
	ch >= '\u30A0' && ch <= '\u30ff' \|\| // Katakana
	ch >= '\u4E00' && ch <= '\u9FFF' \|\| // CJK Unified ideographs
	ch >= '\uF900' && ch <= '\uFAFF' \|\| // CJK Compatibility Ideographs
	ch >= '\uFF00' && ch <= '\uFFEF' //Halfwidth and Fullwidth Forms of Katakana & Fullwidth ASCII variants
	) {
	if (s >= 0) {
	add(text, result, i, s);
	s = -1;
	}
	prevType = Character.MATH_SYMBOL;
	++i;
	continue;
	}

	final int type = Character.getType(ch);
	if (type == Character.LOWERCASE_LETTER \|\|
	type == Character.UPPERCASE_LETTER \|\|
	type == Character.TITLECASE_LETTER \|\|
	type == Character.OTHER_LETTER \|\|
	type == Character.MODIFIER_LETTER \|\|
	type == Character.OTHER_PUNCTUATION
	) {
	//letter
	if (s < 0) {
	//start
	s = i;
	}
	else if (s >= 0 && type == Character.UPPERCASE_LETTER && prevType == Character.LOWERCASE_LETTER) {
	//a\|Camel
	add(text, result, i, s);
	s = i;
	}
	else if (i - s >= 1 && type == Character.LOWERCASE_LETTER && prevType == Character.UPPERCASE_LETTER) {
	//CAPITALN\|ext
	add(text, result, i - 1, s);
	s = i - 1;
	}
	}
	else if (s >= 0) {
	//non-letter
	add(text, result, i, s);
	s = -1;
	}
	prevType = type;
	i++;
	}
	//remainder
	if (s >= 0) {
	add(text, result, i, s);
	}
	return result;
	}

	private static void add(String text, List<TextRange> result, int i, int s) {
	if (i - s > 3) {
	final TextRange textRange = new TextRange(s, i);
	//System.out.println("textRange = " + textRange + " = "+ textRange.substring(text));
	result.add(textRange);
	}
	}
	}