#!/bin/sh

# vim: indentexpr= nosmartindent autoindent
# vim: tabstop=2 shiftwidth=2 softtabstop=2

# Prints, on stdout, a verbose-mode ("(?x)") regex for Unicode word
# segmentation. The regex is assembled from the Word_Break (wb) property
# classes named below; this script takes no arguments and reads no input.
#
# See the comments in regex/sentence.sh for the general approach to how this
# regex was written.
#
# Writing the regex for this was *hard*. It took me two days of hacking to get
# this far, and that was after I had finished the sentence regex, so my brain
# was fully cached on this. Unlike the sentence regex, the rules in the regex
# below don't correspond as nicely to the rules in UAX #29. In particular, the
# UAX #29 rules have a ton of overlap with each other, which requires crazy
# stuff in the regex. I'm not even sure the regex below is 100% correct or even
# minimal, however, I did compare this with the ICU word segmenter on a few
# different corpora, and it produces identical results. (In addition to, of
# course, passing the UCD tests.)
#
# In general, I consider this approach to be a failure. Firstly, this is
# clearly a write-only regex. Secondly, building the minimized DFA for this is
# incredibly slow. Thirdly, the DFA is itself very large (~240KB). Fourthly,
# reversing this regex (for reverse word iteration) results in a >19MB DFA.
# Yes. That's MB. Wat. And it took 5 minutes to build.
#
# I think we might consider changing our approach to this problem. The normal
# path I've seen, I think, is to decode codepoints one at a time, and then
# thread them through a state machine in the code itself. We could take this
# approach, or possibly combine it with a DFA that tells us which Word_Break
# value a codepoint has. I'd prefer the latter approach, but it requires adding
# RegexSet support to regex-automata. Something that should definitely be done,
# but is a fair amount of work.
#
# Gah.

# -u turns a misspelled $Name in the regex below into a hard error instead of
# a silent empty expansion (which would quietly produce a different regex).
set -eu

# Leaf classes: one Word_Break property value each. Single-quoted so the
# backslash is unambiguously literal.
CR='\p{wb=CR}'
LF='\p{wb=LF}'
Newline='\p{wb=Newline}'
ZWJ='\p{wb=ZWJ}'
RI='\p{wb=Regional_Indicator}'
Katakana='\p{wb=Katakana}'
HebrewLet='\p{wb=HebrewLetter}'
ALetter='\p{wb=ALetter}'
SingleQuote='\p{wb=SingleQuote}'
DoubleQuote='\p{wb=DoubleQuote}'
MidNumLet='\p{wb=MidNumLet}'
MidLetter='\p{wb=MidLetter}'
MidNum='\p{wb=MidNum}'
Numeric='\p{wb=Numeric}'
ExtendNumLet='\p{wb=ExtendNumLet}'
WSegSpace='\p{wb=WSegSpace}'

# Derived classes.
Any='\p{any}'
# Extend, Format and ZWJ together; "$Ex*" trails almost every element of the
# regex. NOTE(review): presumably this is how UAX #29's "ignore Extend/Format/
# ZWJ" rule is realised -- confirm against the spec.
Ex="[\p{wb=Extend} \p{wb=Format} $ZWJ]"
ExtendPict='\p{Extended_Pictographic}'
AHLetter="[$ALetter $HebrewLet]"
MidNumLetQ="[$MidNumLet $SingleQuote]"

# A run of letters (resp. digits) in which consecutive letters (digits) may be
# joined by exactly one mid-word (mid-number) punctuation character.
AHLetterRepeat="$AHLetter $Ex* ([$MidLetter $MidNumLetQ] $Ex* $AHLetter $Ex*)*"
NumericRepeat="$Numeric $Ex* ([$MidNum $MidNumLetQ] $Ex* $Numeric $Ex*)*"

# The regex itself. The spaces and newlines between tokens are layout only;
# the script relies on the leading (?x) flag for that. Top-level alternatives,
# in order:
#
#   1. CR immediately followed by LF.
#   2. A single Newline, CR or LF.
#   3. Two or more WSegSpace.
#   4. One or more repetitions of any of:
#        - (optional non-newline) Ex* ZWJ Extended_Pictographic sequences,
#        - words starting with a letter: letters joined directly, by a single
#          mid-letter character, or by numeric/ExtendNumLet runs,
#        - a letter followed only by numeric/ExtendNumLet runs,
#        - the two mirror-image forms starting with a Numeric,
#        - Katakana followed by Katakana/ExtendNumLet,
#        - ExtendNumLet followed by ExtendNumLet/letter/Numeric/Katakana.
#   5. HebrewLet followed by SingleQuote.
#   6. HebrewLet characters joined by DoubleQuote.
#   7. A pair of RI (regional indicators).
#   8. Fallback: any single codepoint plus trailing Ex.
#
# The text is kept flush-left and byte-for-byte as it is emitted; do not
# re-indent it without checking how the consumer treats the output.
WordRegex="(?x)
$CR $LF
|
[$Newline $CR $LF]
|
$WSegSpace $WSegSpace+
|
(
([^$Newline $CR $LF]? $Ex* $ZWJ $ExtendPict $Ex*)+
|
($ExtendNumLet $Ex*)* $AHLetter $Ex*
(
(
($NumericRepeat | $ExtendNumLet $Ex*)*
|
[$MidLetter $MidNumLetQ] $Ex*
)
$AHLetter $Ex*
)+
($NumericRepeat | $ExtendNumLet $Ex*)*
|
($ExtendNumLet $Ex*)* $AHLetter $Ex* ($NumericRepeat | $ExtendNumLet $Ex*)+
|
($ExtendNumLet $Ex*)* $Numeric $Ex*
(
(
($AHLetterRepeat | $ExtendNumLet $Ex*)*
|
[$MidNum $MidNumLetQ] $Ex*
)
$Numeric $Ex*
)+
($AHLetterRepeat | $ExtendNumLet $Ex*)*
|
($ExtendNumLet $Ex*)* $Numeric $Ex* ($AHLetterRepeat | $ExtendNumLet $Ex*)+
|
$Katakana $Ex*
(($Katakana | $ExtendNumLet) $Ex*)+
|
$ExtendNumLet $Ex*
(($ExtendNumLet | $AHLetter | $Numeric | $Katakana) $Ex*)+
)+
|
$HebrewLet $Ex* $SingleQuote $Ex*
|
($HebrewLet $Ex* $DoubleQuote $Ex*)+ $HebrewLet $Ex*
|
$RI $Ex* $RI $Ex*
|
$Any $Ex*
"

# printf rather than echo: the text is full of backslashes, and what echo does
# with backslashes is implementation-defined for /bin/sh. '%s\n' reproduces
# the trailing newline echo would have added.
printf '%s\n' "$WordRegex"