| package org.unicode.cldr.tool; |
| |
| import com.google.common.base.Joiner; |
| import com.google.common.collect.ImmutableSet; |
| import com.ibm.icu.impl.Relation; |
| import com.ibm.icu.impl.Row.R4; |
| import com.ibm.icu.text.BreakIterator; |
| import com.ibm.icu.util.Output; |
| import com.ibm.icu.util.ULocale; |
| import java.io.File; |
| import java.io.FileReader; |
| import java.io.IOException; |
| import java.io.OutputStreamWriter; |
| import java.io.Reader; |
| import java.io.Writer; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.EnumSet; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.LinkedHashMap; |
| import java.util.LinkedHashSet; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| import java.util.Set; |
| import java.util.Stack; |
| import java.util.TreeMap; |
| import java.util.TreeSet; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| import org.unicode.cldr.tool.Option.Options; |
| import org.unicode.cldr.util.CLDRPaths; |
| import org.unicode.cldr.util.CLDRTool; |
| import org.unicode.cldr.util.ChainedMap; |
| import org.unicode.cldr.util.ChainedMap.M4; |
| import org.unicode.cldr.util.CldrUtility; |
| import org.unicode.cldr.util.Counter; |
| import org.unicode.cldr.util.DtdData; |
| import org.unicode.cldr.util.DtdData.Attribute; |
| import org.unicode.cldr.util.DtdData.Element; |
| import org.unicode.cldr.util.DtdType; |
| import org.unicode.cldr.util.Pair; |
| import org.unicode.cldr.util.PathUtilities; |
| import org.unicode.cldr.util.PatternCache; |
| import org.unicode.cldr.util.RegexUtilities; |
| import org.unicode.cldr.util.SimpleHtmlParser; |
| import org.unicode.cldr.util.SimpleHtmlParser.Type; |
| import org.unicode.cldr.util.TransliteratorUtilities; |
| |
| @CLDRTool( |
| alias = "checkhtmlfiles", |
| description = "Look for errors in CLDR documentation tools", |
| hidden = "Used for CLDR process") |
| public class CheckHtmlFiles { |
| |
| static final Set<String> NOPOP = |
| new HashSet<>( |
| Arrays.asList("br", "img", "link", "meta", "!doctype", "hr", "col", "input")); |
| |
| static final EnumSet<Type> SUPPRESS = |
| EnumSet.of( |
| Type.ELEMENT, |
| Type.ELEMENT_START, |
| Type.ELEMENT_END, |
| Type.ELEMENT_POP, |
| Type.ATTRIBUTE, |
| Type.ATTRIBUTE_CONTENT); |
| |
| static final Options myOptions = new Options(); |
| static final Writer LOG = new OutputStreamWriter(System.out); |
| static Pattern WELLFORMED_HEADER = PatternCache.get("\\s*(\\d+(\\.\\d+)*\\s*).*"); |
| static Pattern SUPPRESS_SECTION_NUMBER = |
| PatternCache.get( |
| "(Annex [A-Z]: .*)" |
| + "|(Appendix [A-Z].*)" |
| + "|(.*Migrati(on|ng).*)" |
| + "|Step \\d+.*" |
| + "|Example \\d+.*" |
| + "|D\\d+\\.\\s.*" |
| + "|References" |
| + "|Acknowledge?ments" |
| + "|Rights to .*Images" |
| + "|Modifications" |
| + "|(Revision \\d+\\.?)"); |
| static Pattern SUPPRESS_REVISION = PatternCache.get("Revision \\d+\\.?"); |
| static Pattern SPACES = PatternCache.get("\\s+"); |
| |
| enum MyOptions { |
| // old(".*", Settings.OTHER_WORKSPACE_DIRECTORY + |
| // "cldr-archive/cldr-22.1/specs/ldml/tr35\\.html", "source data (regex)"), |
| target( |
| ".*", |
| CLDRPaths.BASE_DIRECTORY |
| + "specs" |
| + File.separator |
| + "ldml" |
| + File.separator |
| + "tr35(-.*)?\\.html", |
| "target data (regex); ucd for Unicode docs; " |
| + "for others use the format -t ${workspace_loc}/unicode-draft/reports/tr51/tr51.html"), |
| verbose(".*", "none", "verbose debugging messages"), |
| // contents(".*", CLDRPaths.BASE_DIRECTORY + "specs/ldml/tr35(-.*)?\\.html", "generate |
| // contents"), |
| // /cldr-archive |
| ; |
| |
| // boilerplate |
| final Option option; |
| |
| MyOptions(String argumentPattern, String defaultArgument, String helpText) { |
| option = myOptions.add(this, argumentPattern, defaultArgument, helpText); |
| } |
| } |
| |
/**
 * Amount of trace output produced while parsing: {@code none} prints no trace, any other value
 * prints the element stack after each element, and {@code all} additionally traces parser
 * events, attributes and skipped files.
 */
enum Verbosity {
    none,
    element,
    all;

    /**
     * Converts a command-line value to a verbosity level, ignoring case and surrounding
     * whitespace.
     *
     * @param input the raw option value; {@code null} means {@link #none}
     * @return the matching level
     * @throws IllegalArgumentException if the value names no level; the message lists the
     *     accepted values
     */
    static Verbosity of(String input) {
        if (input == null) {
            return none;
        }
        final String key = input.trim().toLowerCase(Locale.ROOT);
        try {
            return valueOf(key);
        } catch (IllegalArgumentException e) {
            // valueOf's own message does not say what would have been accepted
            throw new IllegalArgumentException(
                    "Unknown verbosity '"
                            + input
                            + "'; expected one of "
                            + Arrays.toString(values()),
                    e);
        }
    }
}
| |
| static Verbosity verbose; |
| static boolean doContents; |
| static boolean isLdml; |
| |
| public static void main(String[] args) throws IOException { |
| System.out.println( |
| "First do a replace of <a\\s+name=\"([^\"]*)\"\\s*> by <a name=\"$1\" href=\"#$1\">"); |
| System.out.println("Then check for all links with no anchors: <a([^>]*)></a>"); |
| System.out.println( |
| "Then check for all links that don't start with name or href <a (?!href|name)"); |
| |
| myOptions.parse(MyOptions.target, args, true); |
| verbose = Verbosity.of(MyOptions.verbose.option.getValue()); |
| |
| String targetString = MyOptions.target.option.getValue(); |
| if (targetString.contains("ldml")) { |
| isLdml = true; |
| } |
| if (targetString.equalsIgnoreCase("ucd")) { |
| targetString = |
| CLDRPaths.BASE_DIRECTORY + "../unicode-draft/reports/tr(\\d+)/tr(\\d+).html"; |
| } else if (targetString.equalsIgnoreCase("security")) { |
| targetString = |
| CLDRPaths.BASE_DIRECTORY + "../unicode-draft/reports/tr(3[69])/tr(3[69]).html"; |
| } |
| Data target = new Data().getSentences(targetString); |
| if (target.count == 0) { |
| throw new IllegalArgumentException("No files matched with " + targetString); |
| } |
| |
| if (isLdml) { |
| checkForDtd(target); |
| } |
| |
| System.out.println( |
| "*TOTAL COUNTS* files:" |
| + target.count |
| + ", fatal errors:" |
| + target.totalFatalCount |
| + ", nonfatal errors:" |
| + target.totalErrorCount); |
| if (target.totalFatalCount > 0 || target.totalErrorCount > 0) { |
| System.exit(1); // give an error status |
| } |
| |
| System.exit(0); |
| |
| // Data source = new Data().getSentences(MyOptions.old.option.getValue()); |
| // String file = MyOptions.target.option.getValue(); |
| // |
| // Data target = new Data().getSentences(file); |
| // |
| // int missingCount = 0, extraCount = 0; |
| // int line = 0; |
| // for (String sentence : source) { |
| // ++line; |
| // long sourceCount = source.getCount(sentence); |
| // long targetCount = target.getCount(sentence); |
| // if (targetCount == 0) { |
| // System.out.println(line + "\tMISSING:\t" + sourceCount + "≠" + targetCount |
| // + "\t" + sentence); |
| // ++missingCount; |
| // } |
| // } |
| // line = 0; |
| // for (String sentence : target) { |
| // ++line; |
| // long sourceCount = source.getCount(sentence); |
| // long targetCount = target.getCount(sentence); |
| // if (sourceCount == 0) { |
| // System.out.println(line + "\tEXTRA:\t" + targetCount + "≠" + sourceCount + |
| // "\t" + sentence); |
| // ++extraCount; |
| // } |
| // } |
| // System.out.println("Missing:\t" + missingCount); |
| // System.out.println("Extra:\t" + extraCount); |
| } |
| |
| private static final Set<String> SKIP_ATTR = |
| ImmutableSet.of("draft", "alt", "references", "cldrVersion", "unicodeVersion"); |
| |
| private static void checkForDtd(Data target) { |
| M4<String, String, DtdType, Boolean> typeToElements = |
| ChainedMap.of(new TreeMap(), new TreeMap(), new TreeMap(), Boolean.class); |
| for (DtdType type : DtdType.values()) { |
| if (type.getStatus() != DtdType.DtdStatus.active) { |
| continue; |
| } else if (type == DtdType.ldmlICU) { |
| continue; |
| } |
| DtdData dtdData = DtdData.getInstance(type); |
| Set<Element> elements = dtdData.getElements(); |
| for (Element element : elements) { |
| if (element.isDeprecated() |
| || element.equals(dtdData.PCDATA) |
| || element.equals(dtdData.ANY)) continue; |
| typeToElements.put(element.name, element.toDtdString(), type, Boolean.TRUE); |
| } |
| Set<Attribute> attributes = dtdData.getAttributes(); |
| for (Attribute attribute : attributes) { |
| if (attribute.isDeprecated()) continue; |
| if (SKIP_ATTR.contains(attribute.name)) { |
| continue; |
| } |
| typeToElements.put( |
| attribute.element.name, |
| attribute.appendDtdString(new StringBuilder()).toString(), |
| type, |
| Boolean.TRUE); |
| } |
| } |
| final Map<String, String> skeletonToInFile = new HashMap<>(); |
| Relation<String, String> extra = new Relation(new TreeMap(), TreeSet.class); |
| for (R4<String, String, String, Boolean> elementItem : target.dtdItems.rows()) { |
| String file = elementItem.get0(); |
| String element = elementItem.get1(); |
| String item = elementItem.get2(); |
| extra.put(element, item); |
| skeletonToInFile.put(item.replace(" ", ""), item); |
| } |
| ChainedMap.M4<String, String, DtdType, Comparison> status = |
| ChainedMap.of(new TreeMap(), new TreeMap(), new TreeMap(), Comparison.class); |
| for (R4<String, String, DtdType, Boolean> entry : typeToElements.rows()) { |
| final String element = entry.get0(); |
| final String key = entry.get1(); |
| final DtdType dtdType = entry.get2(); |
| String spaceless = key.replace(" ", ""); |
| String realKey = skeletonToInFile.get(spaceless); |
| if (realKey == null) { |
| status.put(element, key, dtdType, Comparison.missing); |
| } else { |
| boolean found = extra.remove(element, realKey); |
| if (!found) { |
| status.put(element, key, dtdType, Comparison.no_rem); |
| } |
| } |
| } |
| for (Entry<String, String> extraItem : extra.entrySet()) { |
| status.put(extraItem.getKey(), extraItem.getValue(), DtdType.ldmlICU, Comparison.extra); |
| } |
| TreeSet<String> reverse = new TreeSet<>(Collections.reverseOrder()); |
| for (Entry<String, Map<String, Map<DtdType, Comparison>>> entry1 : status) { |
| String element = entry1.getKey(); |
| reverse.clear(); |
| final Map<String, Map<DtdType, Comparison>> itemToDtdTypeToComparison = |
| entry1.getValue(); |
| reverse.addAll(itemToDtdTypeToComparison.keySet()); |
| for (String item : reverse) { |
| Map<DtdType, Comparison> typeToComparison = itemToDtdTypeToComparison.get(item); |
| for (Entry<DtdType, Comparison> entry2 : typeToComparison.entrySet()) { |
| System.out.println( |
| element |
| + "\t" |
| + entry2.getValue() |
| + "\t" |
| + CldrUtility.ifSame(entry2.getKey(), DtdType.ldmlICU, "") |
| + "\t" |
| + item); |
| } |
| } |
| } |
| } |
| |
/** Result of comparing one DTD declaration with the declarations found in the checked files. */
enum Comparison {
    /** The DTD declaration has no space-insensitive match in the files. */
    missing,

    /** A declaration found in the files that was not matched by any DTD declaration. */
    extra,

    /** A match was found, but it could not be removed from the element's unmatched set. */
    no_rem;
}
| |
| static Pattern WHITESPACE = PatternCache.get("[\\s]+"); |
| static Pattern BADSECTION = PatternCache.get("^\\s*(\\d+\\s*)?Section\\s*\\d+\\s*[-:]\\s*"); |
| |
| static final Set<String> FORCEBREAK = |
| new HashSet<>( |
| Arrays.asList( |
| "table", |
| "div", |
| "blockquote", |
| "p", |
| "br", |
| "td", |
| "th", |
| "h1", |
| "h2", |
| "h3", |
| "h4", |
| "h5", |
| "li")); |
| |
| // enum ContentsElements {h1, h2, h3, h4, h5, caption} |
| |
| static final Set<String> DO_CONTENTS = |
| new HashSet<>(Arrays.asList("h1", "h2", "h3", "h4", "h5", "caption")); |
| |
| static class Levels implements Comparable<Levels> { |
| final int[] levels = new int[10]; |
| final int h2_start; |
| |
| public Levels(int h2_start) { |
| levels[0] = h2_start; // special adjustment of starting header level |
| this.h2_start = h2_start; |
| } |
| |
| public Levels() { |
| this(0); |
| } |
| |
| /** |
| * h2 = level 0, h3 is level 1, etc. |
| * |
| * @param level |
| * @return |
| */ |
| Levels next(int level, Output<Boolean> missingLevel) { |
| level -= 2; // h2 = level 0 |
| missingLevel.value = false; |
| if (levels[0] < h2_start) { |
| missingLevel.value = true; |
| } |
| for (int i = 1; i < level; ++i) { |
| if (levels[i] == 0) { |
| missingLevel.value = true; |
| } |
| } |
| levels[level]++; |
| for (int i = level + 1; i < levels.length; ++i) { |
| levels[i] = 0; |
| } |
| return this; |
| } |
| |
| public int getDepth() { |
| for (int i = 0; ; ++i) { |
| int level = levels[i]; |
| if (level == 0) { |
| return i - 1; |
| } |
| } |
| } |
| |
| @Override |
| public String toString() { |
| StringBuilder b = new StringBuilder(); |
| for (int i = 0; ; ++i) { |
| int level = levels[i]; |
| if (level == 0) { |
| return b.toString(); |
| } |
| if (b.length() != 0) { |
| b.append('.'); |
| } |
| b.append(level); |
| } |
| } |
| |
| public static Levels parse(String group) { |
| Levels result = new Levels(); |
| int currentLevel = 0; |
| for (int i = 0; i < group.length(); ++i) { |
| char ch = group.charAt(i); |
| if (ch == '.') { |
| currentLevel++; |
| } else { |
| ch -= '0'; |
| if (ch > '9') { |
| break; |
| } |
| result.levels[currentLevel] = result.levels[currentLevel] * 10 + ch; |
| } |
| } |
| return result; |
| } |
| |
| @Override |
| public int compareTo(Levels other) { |
| for (int i = 0; i < levels.length; ++i) { |
| if (levels[i] != other.levels[i]) { |
| return levels[i] < other.levels[i] ? -1 : 1; |
| } |
| } |
| return 0; |
| } |
| |
| public void set(Levels other) { |
| for (int i = 0; i < levels.length; ++i) { |
| levels[i] = other.levels[i]; |
| } |
| } |
| } |
| |
| static class HeadingInfo { |
| private Levels levels = new Levels(); |
| private String text = ""; |
| private Set<String> ids = new LinkedHashSet<>(); |
| private boolean suppressSection; |
| private boolean isHeader; |
| |
| // temporary |
| private int level; |
| |
| public void setLevel(String headingLabel, HeadingInfo lastHeading) { |
| isHeader = !headingLabel.equals("caption"); |
| level = isHeader ? headingLabel.charAt(1) - '0' : lastHeading.level; |
| } |
| |
| @Override |
| public String toString() { |
| // <h3><a name="Identity_Elements" href="#Identity_Elements">5.3 Identity |
| // Elements</a></h3> |
| String id = ids.isEmpty() ? "NOID" : ids.iterator().next(); |
| String result = |
| "<" |
| + getLabel() |
| + "<a name=\"" |
| + id |
| + "\" href=\"#" |
| + id |
| + "\">" |
| + (!isHeader ? "" : suppressSection ? "" : levels + " ") |
| + TransliteratorUtilities.toHTML.transform(text) |
| + "</a>"; |
| if (ids.size() > 1) { |
| boolean first = true; |
| for (String id2 : ids) { |
| if (first) { |
| first = false; |
| } else { |
| result += "<a name=\"" + id2 + "\"></a>"; |
| } |
| } |
| } |
| return result + "</" + getLabel(); |
| } |
| |
| public String getLabel() { |
| return isHeader ? "h" + level + ">" : "caption>"; |
| } |
| |
| public String toHeader() { |
| String id = ids.iterator().next(); |
| return ("<li>" |
| + (!isHeader |
| ? (text.contains("Table") || text.contains("Figure") ? "" : "Table: ") |
| : suppressSection ? "" : levels + " ") |
| + "<a href=\"#" |
| + id |
| + "\">" |
| + TransliteratorUtilities.toHTML.transform(text) |
| + "</a>"); |
| } |
| |
| public void addText(String toAppend) { |
| String temp = TransliteratorUtilities.fromHTML.transform(toAppend); |
| if (text.isEmpty()) { |
| if (temp.startsWith(" ")) { |
| text = temp.substring(1); |
| } else { |
| text = temp; |
| } |
| } else { |
| text += temp; |
| } |
| text = |
| SPACES.matcher(text) |
| .replaceAll(" "); // clean up all spaces; make more efficient later |
| // used to trim, but we need to retain space between elements. So only trim the start, |
| // and later, the end |
| } |
| |
| public boolean isContents() { |
| return text.toString().startsWith("Contents"); |
| } |
| |
| void addId(String id) { |
| this.ids.add(id); |
| } |
| |
| public void setLevels(int line, Levels levels, Set<String> errors) { |
| this.levels.set(levels); |
| String error = ""; |
| if (badSectionMatcher.reset(text).find()) { |
| text = text.substring(badSectionMatcher.end()); |
| error += "Extra 'Section...' at start; "; |
| } |
| if (isHeader) { |
| if (!headerMatcher.reset(text).matches()) { |
| if (!SUPPRESS_SECTION_NUMBER.matcher(text).matches()) { |
| error += "Missing section numbers; "; |
| } |
| } else { |
| text = text.substring(headerMatcher.end(1)); |
| if (text.startsWith(".")) { |
| text = text.substring(1).trim(); |
| error += "Extra . at start; "; |
| } |
| Levels parsedLevels = Levels.parse(headerMatcher.group(1)); |
| if (levels.compareTo(parsedLevels) != 0) { |
| error += "Section numbers mismatch, was " + parsedLevels + "; "; |
| } |
| } |
| } |
| if (ids.isEmpty()) { |
| addId(text.toString().trim().replaceAll("[^A-Za-z0-9]+", "_")); |
| error += "Missing double link"; |
| } |
| if (!error.isEmpty()) { |
| errors.add(this + "\t<!-- " + line + ": " + error + " -->"); |
| } |
| suppressSection = SUPPRESS_SECTION_NUMBER.matcher(text).matches(); |
| } |
| |
| public void addIds(Counter<String> idCounter) { |
| for (String id : ids) { |
| idCounter.add(id, 1); |
| } |
| } |
| |
| public HeadingInfo fixText() { |
| if (text.endsWith(" ")) { |
| text = text.substring(0, text.length() - 1); |
| } |
| return this; |
| } |
| } |
| |
| static Matcher headerMatcher = WELLFORMED_HEADER.matcher(""); |
| static Matcher badSectionMatcher = BADSECTION.matcher(""); |
| |
| static class HeadingInfoList { |
| private static final long serialVersionUID = -6722150173224993960L; |
| Levels lastBuildLevel; |
| private Set<String> errors = new LinkedHashSet<>(); |
| Output<Boolean> missingLevel = new Output<>(false); |
| private String fileName; |
| ArrayList<HeadingInfo> list = new ArrayList<>(); |
| |
| public HeadingInfoList(String fileName, int h2_START) { |
| this.fileName = fileName; |
| lastBuildLevel = new Levels(h2_START); |
| } |
| |
| public boolean add(int line, HeadingInfo h) { |
| h.fixText(); |
| if (SUPPRESS_REVISION.matcher(h.text).matches()) { |
| return false; |
| } |
| if (h.isHeader) { |
| h.setLevels(line, lastBuildLevel.next(h.level, missingLevel), errors); |
| } else { |
| h.setLevels(line, lastBuildLevel, errors); |
| } |
| if (missingLevel.value) { |
| errors.add("FATAL: Missing Level in: " + h); |
| } |
| return list.add(h); |
| } |
| |
| static final String PAD = "\t"; |
| |
| public void listContents() { |
| |
| System.out.print("\n\t\t<!-- START Generated TOC: CheckHtmlFiles -->"); |
| Counter<String> idCounter = new Counter<>(); |
| |
| int lastLevel = new Levels().getDepth(); |
| String pad = PAD; |
| int ulCount = 0; |
| int liCount = 0; |
| for (HeadingInfo h : list) { |
| h.addIds(idCounter); |
| final int depth = h.levels.getDepth() + (h.isHeader ? 0 : 1); |
| int levelDiff = depth - lastLevel; |
| lastLevel = depth; |
| if (levelDiff > 0) { |
| System.out.println(); |
| for (int i = 0; i < levelDiff; ++i) { |
| pad += PAD; |
| System.out.println(pad + "<ul class=\"toc\">"); |
| ++ulCount; |
| } |
| pad += PAD; |
| } else if (levelDiff < 0) { |
| System.out.println("</li>"); |
| --liCount; |
| for (int i = 0; i > levelDiff; --i) { |
| pad = pad.substring(PAD.length()); |
| System.out.println(pad + "</ul>"); |
| --ulCount; |
| pad = pad.substring(PAD.length()); |
| System.out.println(pad + "</li>"); |
| --liCount; |
| } |
| } else { |
| System.out.println("</li>"); |
| --liCount; |
| } |
| |
| System.out.print(pad + h.toHeader()); |
| ++liCount; |
| |
| // <li>1.1 <a href="#Conformance">Conformance</a></li> |
| |
| // <ul class="toc"> |
| // <li>1 <a href="#Introduction">Introduction</a> |
| // <ul class="toc"> |
| // <li>1.1 <a href="#Conformance">Conformance</a> |
| // </li> |
| // ... |
| // </ul> |
| // </li> |
| } |
| |
| // finish up and make sure we are balances |
| |
| int levelDiff = -lastLevel; |
| System.out.println("</li>"); |
| --liCount; |
| for (int i = 0; i > levelDiff; --i) { |
| pad = pad.substring(PAD.length()); |
| System.out.println(pad + "</ul>"); |
| --ulCount; |
| pad = pad.substring(PAD.length()); |
| System.out.println(pad + "</li>"); |
| --liCount; |
| } |
| pad = pad.substring(PAD.length()); |
| System.out.println(pad + "</ul>"); |
| System.out.println(pad + "<!-- END Generated TOC: CheckHtmlFiles -->"); |
| --ulCount; |
| if (liCount != 0 || ulCount != 0) { |
| throw new IllegalArgumentException( |
| "Mismatched counts in generated contents, li:" |
| + liCount |
| + ", ul:" |
| + ulCount); |
| } |
| for (String id : idCounter) { |
| long count = idCounter.get(id); |
| if (count != 1) { |
| errors.add("FATAL: Non-Unique ID: " + id); |
| } |
| } |
| } |
| |
| /** |
| * Prints out errs |
| * |
| * @return fatal err count |
| */ |
| public int showErrors() { |
| int fatalCount = 0; |
| if (!errors.isEmpty()) { |
| System.out.println("\n*ERRORS*\n"); |
| for (String error : errors) { |
| if (error.startsWith("FATAL:")) { |
| System.out.println(fileName + "\t" + error); |
| fatalCount++; |
| } |
| } |
| if (fatalCount == 0) { |
| for (String error : errors) { |
| System.out.println(fileName + "\t" + error); |
| } |
| } |
| } |
| if (this.list.size() == 0) { |
| System.out.println("No header items (eg <h2>) captured."); |
| fatalCount = 1; |
| } |
| return fatalCount; |
| } |
| |
| /** |
| * @return total number of errors |
| */ |
| public int totalErrorCount() { |
| return errors.size(); |
| } |
| } |
| |
| static class ElementLine { |
| final String element; |
| final int line; |
| |
| public ElementLine(String element, int line) { |
| super(); |
| this.element = element; |
| this.line = line; |
| } |
| |
| @Override |
| public String toString() { |
| return element + '[' + line + ']'; |
| } |
| } |
| |
| static class Data implements Iterable<String> { |
| private static final Pattern ELEMENT_ATTLIST = |
| Pattern.compile("<!(ELEMENT|ATTLIST)\\s+(\\S+)[^>]*>"); |
| List<String> sentences = new ArrayList<>(); |
| M4<String, String, String, Boolean> dtdItems = |
| ChainedMap.of( |
| new LinkedHashMap<String, Object>(), |
| new TreeMap<String, Object>(), |
| new TreeMap<String, Object>(), |
| Boolean.class); |
| Counter<String> hashedSentences = new Counter<>(); |
| int count = 0; |
| int totalErrorCount = 0; |
| int totalFatalCount = 0; |
| |
| public Data getSentences(String fileRegex) throws IOException { |
| String base; |
| String regex; |
| try { |
| int firstParen = fileRegex.indexOf('('); |
| if (firstParen < 0) { |
| firstParen = fileRegex.length(); |
| } |
| int lastSlash = fileRegex.lastIndexOf(File.separatorChar, firstParen); |
| base = fileRegex.substring(0, lastSlash); |
| regex = fileRegex.substring(lastSlash + 1); |
| } catch (Exception e) { |
| throw new IllegalArgumentException( |
| "Target file must be in special format. " |
| + "Up to the first path part /.../ containing a paragraph is constant, and the rest is a regex."); |
| } |
| |
| // File sourceFile = new File(fileRegex); |
| File sourceDirectory = new File(base); |
| if (!sourceDirectory.exists()) { |
| throw new IllegalArgumentException("Can't find " + sourceDirectory); |
| } |
| String canonicalBase = PathUtilities.getNormalizedPathString(sourceDirectory); |
| String FileRegex = canonicalBase + File.separator + regex; |
| FileRegex = FileRegex.replace("\\", "\\\\"); |
| FileRegex = FileRegex.replace("\\\\.", "\\."); |
| Matcher m = PatternCache.get(FileRegex).matcher(""); |
| System.out.println("Matcher: " + m); |
| |
| return getSentences(sourceDirectory, m); |
| } |
| |
| public Data getSentences(File sourceDirectory, Matcher m) throws IOException { |
| // System.out.println("Processing:\t" + sourceDirectory); |
| for (File file : sourceDirectory.listFiles()) { |
| if (file.isDirectory()) { |
| getSentences(file, m); |
| continue; |
| } |
| String fileString = file.getCanonicalFile().toString(); |
| File fileCanonical = new File(fileString); |
| if (!m.reset(fileString).matches()) { |
| if (verbose == Verbosity.all) { |
| System.out.println( |
| "Skipping: " |
| + RegexUtilities.showMismatch(m, fileString) |
| + "\t" |
| + sourceDirectory); |
| } |
| continue; |
| } |
| |
| System.out.println( |
| "\nProcessing:\t" + sourceDirectory + File.separator + fileString); |
| |
| int H2_START = fileString.contains("tr18") ? -1 : 0; |
| try (Reader in = new FileReader(fileCanonical)) { |
| parseFile(fileCanonical, H2_START, in); |
| } |
| } |
| return this; |
| } |
| |
| SimpleHtmlParser parser = new SimpleHtmlParser(); |
| |
| public void parseFile(File fileCanonical, int H2_START, Reader in) throws IOException { |
| Matcher wsMatcher = WHITESPACE.matcher(""); |
| ++count; |
| // SimpleHtmlParser parser = new SimpleHtmlParser().setReader(in); |
| parser.setReader(in); |
| StringBuilder buffer = new StringBuilder(); |
| StringBuilder content = new StringBuilder(); |
| HeadingInfo heading = new HeadingInfo(); |
| final String fileName = fileCanonical.getName(); |
| HeadingInfoList headingInfoList = new HeadingInfoList(fileName, H2_START); |
| Stack<ElementLine> elementStack = new Stack<>(); |
| Stack<Pair<String, String>> attributeStack = new Stack<>(); |
| String contentString; |
| boolean inHeading = false; |
| boolean inPop = false; |
| boolean inAnchor = false; |
| boolean haveContents = false; |
| HeadingInfo lastHeading = null; |
| // for detecting missing captions |
| boolean pushedTable = false; |
| boolean checkCaption = false; |
| List<Integer> captionWarnings = new ArrayList<>(); |
| |
| main: |
| while (true) { |
| int lineCount = parser.getLineCount(); |
| Type x = parser.next(content); |
| if (verbose == Verbosity.all && !SUPPRESS.contains(x)) { |
| LOG.write(parser.getLineCount() + "\t" + x + ":\t«" + content + "»"); |
| // SimpleHtmlParser.writeResult(x, content, LOG); |
| LOG.write("\n"); |
| LOG.flush(); |
| } |
| switch (x) { |
| case QUOTE: |
| contentString = content.toString().toLowerCase(Locale.ENGLISH).trim(); |
| if (contentString.equalsIgnoreCase("nocaption")) { |
| pushedTable = false; |
| } |
| break; |
| case ATTRIBUTE: |
| contentString = content.toString().toLowerCase(Locale.ENGLISH); |
| if (inHeading |
| && (contentString.equals("name") || contentString.equals("id"))) { |
| inAnchor = true; |
| } else { |
| inAnchor = false; |
| } |
| attributeStack.add(new Pair<String, String>(contentString, null)); |
| break; |
| case ATTRIBUTE_CONTENT: |
| contentString = content.toString().toLowerCase(Locale.ENGLISH); |
| if (inAnchor) { |
| heading.addId(content.toString()); |
| } |
| Pair<String, String> lastAttribute = attributeStack.peek(); |
| if (lastAttribute.getSecond() != null) { |
| System.out.println( |
| lineCount |
| + "\tDouble Attribute: " |
| + contentString |
| + ", peek=" |
| + lastAttribute); |
| } else { |
| lastAttribute.setSecond(contentString); |
| } |
| break; |
| case ELEMENT: |
| contentString = content.toString().toLowerCase(Locale.ENGLISH); |
| if (inPop) { |
| ElementLine peek; |
| while (true) { |
| peek = elementStack.peek(); |
| if (!NOPOP.contains(peek.element)) { |
| break; |
| } |
| elementStack.pop(); |
| } |
| if (!peek.element.equals(contentString)) { |
| System.out.println( |
| lineCount |
| + "\tCouldn't pop: " |
| + contentString |
| + ", " |
| + showElementStack(elementStack)); |
| } else { |
| elementStack.pop(); |
| } |
| } else { |
| // check that the first element following a table is a caption |
| if (pushedTable && !"caption".equals(contentString)) { |
| captionWarnings.add(lineCount); |
| } |
| elementStack.push(new ElementLine(contentString, lineCount)); |
| pushedTable = checkCaption && "table".equals(contentString); |
| if (!checkCaption |
| && "h3".equals(contentString)) { // h3 around Summary in |
| // standard format |
| checkCaption = true; |
| } |
| } |
| if (verbose != Verbosity.none) { |
| LOG.write( |
| parser.getLineCount() |
| + "\telem:\t" |
| + showElementStack(elementStack) |
| + "\n"); |
| LOG.flush(); |
| } |
| if (FORCEBREAK.contains(contentString)) { |
| buffer.append("\n"); |
| } |
| if (DO_CONTENTS.contains(contentString)) { |
| if (inPop) { |
| if (inHeading) { |
| inHeading = false; |
| if (heading.isContents()) { |
| haveContents = true; |
| } else if (haveContents) { |
| headingInfoList.add(parser.getLineCount(), heading); |
| lastHeading = heading; |
| } |
| heading = new HeadingInfo(); |
| } |
| } else { |
| heading.setLevel(contentString, lastHeading); |
| inHeading = true; |
| } |
| } |
| break; |
| case ELEMENT_START: |
| inPop = false; |
| break; |
| case ELEMENT_END: |
| if (verbose == Verbosity.all && !attributeStack.isEmpty()) { |
| LOG.write( |
| parser.getLineCount() |
| + "\tattr:\t" |
| + showAttributeStack(attributeStack) |
| + System.lineSeparator()); |
| LOG.flush(); |
| } |
| attributeStack.clear(); |
| inPop = false; |
| break; |
| case ELEMENT_POP: |
| inPop = true; |
| break; |
| case ELEMENT_CONTENT: |
| contentString = |
| wsMatcher.reset(content).replaceAll(" ").replace(" ", " "); |
| buffer.append( |
| contentString.indexOf('&') >= 0 |
| ? TransliteratorUtilities.fromHTML.transform(contentString) |
| : contentString); |
| if (inHeading) { |
| heading.addText(contentString); |
| } |
| break; |
| case DONE: |
| break main; |
| default: |
| break; // skip everything else. |
| } |
| } |
| |
| // get DTD elements |
| Matcher m = ELEMENT_ATTLIST.matcher(buffer); |
| while (m.find()) { |
| dtdItems.put(fileName, m.group(2), m.group(), true); |
| // System.out.println(fileName + "\t" + m.group()); |
| } |
| BreakIterator sentenceBreak = BreakIterator.getSentenceInstance(ULocale.ENGLISH); |
| String bufferString = normalizeWhitespace(buffer); |
| sentenceBreak.setText(bufferString); |
| int last = 0; |
| while (true) { |
| int pos = sentenceBreak.next(); |
| if (pos == BreakIterator.DONE) { |
| break; |
| } |
| String sentence = bufferString.substring(last, pos).trim(); |
| last = pos; |
| if (sentence.isEmpty()) { |
| continue; |
| } |
| hashedSentences.add(sentence, 1); |
| sentences.add(sentence); |
| } |
| if (!captionWarnings.isEmpty()) { |
| System.out.println( |
| "WARNING: Missing <caption> on the following lines: " |
| + "\n " |
| + Joiner.on(", ").join(captionWarnings) |
| + "\n\tTo fix, add <caption> after the <table>, such as:" |
| + "\n\t\t<table>" |
| + "\n\t\t\t<caption>Private Use Codes in CLDR</a></caption>" |
| + "\n\tOften the sentence just before the <table> can be made into the caption." |
| + "\n\tThe next time you run this program, you’ll be prompted with double-links." |
| + "\n\tIf it really shouldn't have a caption, add <!-- nocaption --> after the <table> instead."); |
| } |
| int fatalCount = headingInfoList.showErrors(); |
| totalFatalCount += fatalCount; |
| totalErrorCount += headingInfoList.totalErrorCount(); |
| if (fatalCount == 0) { |
| headingInfoList.listContents(); |
| } else { |
| System.out.println( |
| "\nFix fatal errors in " |
| + fileCanonical |
| + " before contents can be generated"); |
| } |
| } |
| |
| private String showAttributeStack(Stack<Pair<String, String>> attributeStack) { |
| StringBuilder result = new StringBuilder(); |
| for (Pair<String, String> s : attributeStack) { |
| result.append("[@"); |
| result.append(s.getFirst()); |
| final String second = s.getSecond(); |
| if (second != null) { |
| result.append("='"); |
| result.append(second); |
| result.append("'"); |
| } |
| result.append("]"); |
| } |
| return result.toString(); |
| } |
| |
| private String showElementStack(Stack<ElementLine> elementStack) { |
| StringBuilder result = new StringBuilder(); |
| for (ElementLine s : elementStack) { |
| result.append('/').append(s); |
| } |
| return result.toString(); |
| } |
| |
| /** |
| * Return string after collapsing multiple whitespace containing '\\n' to '\\n', and |
| * otherwise 'space'. |
| * |
| * @param input |
| * @return |
| */ |
| private String normalizeWhitespace(CharSequence input) { |
| Matcher m = WHITESPACE.matcher(input); |
| StringBuilder buffer = new StringBuilder(); |
| int last = 0; |
| while (m.find()) { |
| int start = m.start(); |
| buffer.append(input.subSequence(last, start)); |
| last = m.end(); |
| String whiteString = m.group(); |
| if (whiteString.indexOf('\n') >= 0) { |
| buffer.append('\n'); |
| } else { |
| buffer.append(' '); |
| } |
| } |
| buffer.append(input.subSequence(last, input.length())); |
| return buffer.toString().trim(); |
| } |
| |
| public long getCount(String sentence) { |
| return hashedSentences.getCount(sentence); |
| } |
| |
| @Override |
| public Iterator<String> iterator() { |
| return sentences.iterator(); |
| } |
| } |
| } |