ICU 71.1  71.1
rbbi.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ***************************************************************************
5 * Copyright (C) 1999-2016 International Business Machines Corporation *
6 * and others. All rights reserved. *
7 ***************************************************************************
8 
9 **********************************************************************
10 * Date Name Description
11 * 10/22/99 alan Creation.
12 * 11/11/99 rgillam Complete port from Java.
13 **********************************************************************
14 */
15 
16 #ifndef RBBI_H
17 #define RBBI_H
18 
19 #include "unicode/utypes.h"
20 
21 #if U_SHOW_CPLUSPLUS_API
22 
28 #if !UCONFIG_NO_BREAK_ITERATION
29 
30 #include "unicode/brkiter.h"
31 #include "unicode/udata.h"
32 #include "unicode/parseerr.h"
33 #include "unicode/schriter.h"
34 
35 struct UCPTrie;  // Immutable Unicode code point trie structure (ucptrie.h); forward-declared before U_NAMESPACE_BEGIN, i.e. outside the ICU namespace
36 
37 U_NAMESPACE_BEGIN
38 
40 class LanguageBreakEngine;  // forward declaration: used below only as the pointee of getLanguageBreakEngine()'s return type
41 struct RBBIDataHeader;  // forward declaration: used below only as a pointer constructor parameter
42 class RBBIDataWrapper;  // forward declaration: used below only through the fData pointer
43 class UnhandledEngine;  // forward declaration: used below only through the fUnhandledBreakEngine pointer
44 class UStack;  // forward declaration: used below only through the fLanguageBreakEngines pointer
45 
58 
59 private:
64  UText fText;  // UText struct held by value, not by pointer. NOTE(review): presumably the text being iterated over - confirm
65 
66 #ifndef U_HIDE_INTERNAL_API
67 public:
68 #endif /* U_HIDE_INTERNAL_API */
69 
74  RBBIDataWrapper *fData;  // The rule data for this BreakIterator instance. Public unless U_HIDE_INTERNAL_API is defined (see the guarded "public:" just above)
75 private:
76 
81  int32_t fPosition;  // NOTE(review): presumably the current iteration position, as reported by current() - confirm
82 
86  int32_t fRuleStatusIndex;  // NOTE(review): presumably an index into rule status data used by getRuleStatus()/getRuleStatusVec() - confirm
87 
91  class BreakCache;  // nested class, forward-declared only; its definition is not part of this listing
92  BreakCache *fBreakCache;  // pointer to the nested BreakCache declared on the previous line
93 
98  class DictionaryCache;  // nested class, forward-declared only; its definition is not part of this listing
99  DictionaryCache *fDictionaryCache;  // pointer to the nested DictionaryCache declared on the previous line
100 
108  UStack *fLanguageBreakEngines;  // NOTE(review): presumably the LanguageBreakEngine objects handed out by getLanguageBreakEngine() - confirm
109 
117  UnhandledEngine *fUnhandledBreakEngine;  // NOTE(review): presumably the fallback for characters no language engine handles - confirm
118 
124  uint32_t fDictionaryCharCount;  // unsigned counter. NOTE(review): presumably counts dictionary characters seen while iterating - confirm
125 
131  CharacterIterator *fCharIter;  // pointer to the abstract CharacterIterator text-iteration API. NOTE(review): ownership is not visible here - confirm
132 
138  StringCharacterIterator fSCharIter;  // concrete CharacterIterator subclass, embedded by value
139 
143  UBool fDone;  // UBool is the ICU boolean type (a signed-byte integer). NOTE(review): presumably set when iteration has run off the text - confirm
144 
148  int32_t *fLookAheadMatches;  // NOTE(review): presumably an array of look-ahead match positions; size and ownership are not visible here - confirm
149 
153  UBool fIsPhraseBreaking;  // NOTE(review): presumably stores the isPhraseBreaking argument of the private UDataMemory constructor - confirm
154 
155  //=======================================================================
156  // constructors
157  //=======================================================================
158 
169  RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);  // private constructor (still under "private:") from a raw RBBIDataHeader; outcome is reported through status
170 
184  RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status);  // private constructor from a UDataMemory image plus a phrase-breaking flag; outcome is reported through status
185 
187  friend class RBBIRuleBuilder;  // friend: may use the private members above, including the private constructors
189  friend class BreakIterator;  // friend: the base class may likewise use the private constructors
190 
191 public:
192 
198 
206 
216  UParseError &parseError,
217  UErrorCode &status);
218 
242  RuleBasedBreakIterator(const uint8_t *compiledRules,  // public constructor from a read-only byte buffer of compiled rules
243  uint32_t ruleLength,  // NOTE(review): presumably the length of compiledRules in bytes - confirm
244  UErrorCode &status);  // ICU error code (substitute for exceptions) through which the outcome is reported
245 
259 
264  virtual ~RuleBasedBreakIterator();  // destructor, declared virtual
265 
274 
283  virtual bool operator==(const BreakIterator& that) const override;  // overrides BreakIterator::operator==: return true if another object is semantically equal to this one
284 
292  inline bool operator!=(const BreakIterator& that) const;  // non-virtual; returns the complement of operator== (inline definition follows the class)
293 
304  virtual RuleBasedBreakIterator* clone() const override;  // return a polymorphic copy of this object; return type is narrowed from BreakIterator* to RuleBasedBreakIterator*
305 
311  virtual int32_t hashCode(void) const;  // virtual but not marked override; returns a 32-bit value. NOTE(review): presumably a hash consistent with operator== - confirm
312 
318  virtual const UnicodeString& getRules(void) const;  // virtual but not marked override; returns a const reference. NOTE(review): presumably the rule source text of this iterator - confirm
319 
320  //=======================================================================
321  // BreakIterator overrides
322  //=======================================================================
323 
349  virtual CharacterIterator& getText(void) const override;  // return a CharacterIterator over the text being analyzed
350 
351 
366  virtual UText *getUText(UText *fillIn, UErrorCode &status) const override;  // get a UText for the text being analyzed; outcome is reported through status
367 
375  virtual void adoptText(CharacterIterator* newText) override;  // change the text over which this operates. NOTE(review): "adopt" suggests ownership of newText is taken - confirm
376 
388  virtual void setText(const UnicodeString& newText) override;  // change the text over which this operates
389 
403  virtual void setText(UText *text, UErrorCode &status) override;  // UText overload of setText; outcome is reported through status
404 
410  virtual int32_t first(void) override;  // sets the current iteration position to the beginning of the text, position zero
411 
417  virtual int32_t last(void) override;  // set the iterator position to the index immediately beyond the last character in the text
418 
429  virtual int32_t next(int32_t n) override;  // counted overload of next(). NOTE(review): presumably moves n boundaries, with the sign of n giving the direction - confirm
430 
436  virtual int32_t next(void) override;  // advance the iterator to the boundary following the current boundary
437 
443  virtual int32_t previous(void) override;  // set the iterator position to the boundary preceding the current boundary
444 
452  virtual int32_t following(int32_t offset) override;  // advance the iterator to the first boundary following the specified offset
453 
461  virtual int32_t preceding(int32_t offset) override;  // set the iterator position to the first boundary preceding the specified offset
462 
471  virtual UBool isBoundary(int32_t offset) override;  // return true if the specified position is a boundary position
472 
481  virtual int32_t current(void) const override;  // return character index of the current iterator position within the text
482 
483 
515  virtual int32_t getRuleStatus() const override;  // return the status tag from the break rule that determined the boundary. NOTE(review): which boundary is meant is cut off in the reference text - confirm
516 
540  virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) override;  // get the status (tag) values from the break rule(s) that determined the boundary. NOTE(review): presumably written into fillInVec, bounded by capacity - confirm
541 
553  virtual UClassID getDynamicClassID(void) const override;  // return a polymorphic class ID for this object (UClassID identifies classes without the compiler's RTTI)
554 
566  static UClassID U_EXPORT2 getStaticClassID(void);  // static, so callable without an instance. NOTE(review): presumably the same ID that getDynamicClassID() reports for this class - confirm
567 
568 #ifndef U_FORCE_HIDE_DEPRECATED_API
569 
595  virtual RuleBasedBreakIterator *createBufferClone(void *stackBuffer,  // deprecated functionality; compiled out when U_FORCE_HIDE_DEPRECATED_API is defined
596  int32_t &BufferSize,  // passed by non-const reference
597  UErrorCode &status) override;  // overrides BreakIterator::createBufferClone with the return type narrowed to RuleBasedBreakIterator*
598 #endif // U_FORCE_HIDE_DEPRECATED_API
599 
617  virtual const uint8_t *getBinaryRules(uint32_t &length);  // non-const, virtual, not marked override. NOTE(review): presumably returns the compiled rule bytes and stores their size in length - confirm
618 
644  virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status) override;  // set the subject text the break iterator is operating on without changing anything else about it; return type is narrowed from BreakIterator& to RuleBasedBreakIterator&
645 
646 
647 private:
648  //=======================================================================
649  // implementation
650  //=======================================================================
656  void reset(void);  // private helper with no arguments or result. NOTE(review): presumably returns the iterator to its initial state - confirm
657 
662  void init(UErrorCode &status);  // private helper; outcome is reported through status. NOTE(review): presumably shared initialization called by the constructors - confirm
663 
673  int32_t handleSafePrevious(int32_t fromPosition);  // non-template form; a templatized overload with the same parameter list is declared further down. NOTE(review): presumably dispatches to it - confirm
674 
687  int32_t handleNext();  // non-template form; a templatized overload is declared further down. NOTE(review): presumably dispatches to it - confirm
688 
689  /*
690  * Templatized version of handleNext() and handleSafePrevious().
691  *
692  * There will be exactly four instantiations, two each for 8 and 16 bit tables,
693  * two each for 8 and 16 bit trie.
694  * Having separate instantiations for the table types keeps conditional tests of
695  * the table type out of the inner loops, at the expense of replicated code.
696  *
697  * The template parameter for the Trie access function is a value, not a type.
698  * Doing it this way, the compiler will inline the Trie function in the
699  * expanded functions. (Both the 8 and 16 bit access functions have the same type
700  * signature)
701  */
702 
703  typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);  // pointer to a trie access function: takes a trie and a code point, returns uint16_t; the 8 and 16 bit accessors share this signature
704 
705  template<typename RowType, PTrieFunc trieFunc>  // RowType selects the table type; trieFunc is a value (not type) parameter so the compiler can inline the trie access
706  int32_t handleSafePrevious(int32_t fromPosition);  // templatized version of handleSafePrevious()
707 
708  template<typename RowType, PTrieFunc trieFunc>  // RowType selects the table type; trieFunc is a value (not type) parameter so the compiler can inline the trie access
709  int32_t handleNext();  // templatized version of handleNext()
710 
711 
718  const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);  // private; takes a single code point. NOTE(review): presumably returns the engine that handles c, possibly null - confirm
719 
720  public:
721 #ifndef U_HIDE_INTERNAL_API
722 
726  void dumpCache();  // public internal API, hidden when U_HIDE_INTERNAL_API is defined. NOTE(review): presumably debug output of the break cache - confirm
727 
732  void dumpTables();  // public internal API, hidden when U_HIDE_INTERNAL_API is defined. NOTE(review): presumably debug output of the rule tables - confirm
733 #endif /* U_HIDE_INTERNAL_API */
734 };
735 
736 //------------------------------------------------------------------------------
737 //
738 // Inline Functions Definitions ...
739 //
740 //------------------------------------------------------------------------------
741 
742 inline bool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {  // inequality is defined purely as the negation of equality
743  return !operator==(that);  // unqualified member call, so it goes through the virtual operator== declared in the class
744 }
745 
746 U_NAMESPACE_END
747 
748 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
749 
750 #endif /* U_SHOW_CPLUSPLUS_API */
751 
752 #endif
C++ API: Break Iterator.
bool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:335
BreakIterator & operator=(const BreakIterator &other)
RBBIDataWrapper * fData
The rule data for this BreakIterator instance.
Definition: rbbi.h:74
virtual int32_t next(void)=0
Advance the iterator to the boundary following the current boundary.
virtual UBool isBoundary(int32_t offset)=0
Return true if the specified position is a boundary position.
virtual void adoptText(CharacterIterator *it)=0
Change the text over which this operates.
virtual bool operator==(const BreakIterator &) const =0
Return true if another object is semantically equal to this one.
Immutable Unicode code point trie structure.
Definition: ucptrie.h:60
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:96
virtual CharacterIterator & getText(void) const =0
Return a CharacterIterator over the text being analyzed.
virtual UText * getUText(UText *fillIn, UErrorCode &status) const =0
Get a UText for the text being analyzed.
virtual int32_t first(void)=0
Sets the current iteration position to the beginning of the text, position zero.
virtual int32_t following(int32_t offset)=0
Advance the iterator to the first boundary following the specified offset.
bool operator!=(const BreakIterator &rhs) const
Returns the complement of the result of operator==.
Definition: brkiter.h:135
Abstract class that defines an API for iteration on text objects.
Definition: chariter.h:361
C++ API: String Character Iterator.
A concrete subclass of CharacterIterator that iterates over the characters (code units or code points...
Definition: schriter.h:48
The BreakIterator class implements methods for finding the location of boundaries in text...
Definition: brkiter.h:106
virtual int32_t last(void)=0
Set the iterator position to the index immediately BEYOND the last character in the text being scanne...
virtual int32_t current(void) const =0
Return character index of the current iterator position within the text.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:467
virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status)
For RuleBasedBreakIterators, get the status (tag) values from the break rule(s) that determined the b...
C API: Data loading interface.
struct UDataMemory UDataMemory
Forward declaration of the data memory type.
Definition: udata.h:161
virtual int32_t previous(void)=0
Set the iterator position to the boundary preceding the current boundary.
virtual UClassID getDynamicClassID(void) const override=0
Return a polymorphic class ID for this object.
virtual void setText(const UnicodeString &text)=0
Change the text over which this operates.
virtual BreakIterator & refreshInputText(UText *input, UErrorCode &status)=0
Set the subject text string upon which the break iterator is operating without changing any other asp...
C API: Parse Error Information.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
virtual BreakIterator * createBufferClone(void *stackBuffer, int32_t &BufferSize, UErrorCode &status)=0
Deprecated functionality.
virtual int32_t getRuleStatus() const
For RuleBasedBreakIterators, return the status tag from the break rule that determined the boundary a...
UText struct.
Definition: utext.h:1328
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:57
virtual int32_t preceding(int32_t offset)=0
Set the iterator position to the first boundary preceding the specified offset.
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:58
Basic definitions for ICU, for both C and C++ APIs.
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:300
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:295
virtual BreakIterator * clone() const =0
Return a polymorphic copy of this object.
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition: umachine.h:269