LCOV - code coverage report
Current view: top level - src - pgn_lexer.h (source / functions) Hit Total Coverage
Test: coverage.info Lines: 166 186 89.2 %
Date: 2019-01-29 11:06:41 Functions: 26 39 66.7 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (C) 2018  Fulvio Benini.
       3             :  *
       4             :  * Permission is hereby granted, free of charge, to any person obtaining a
       5             :  * copy of this software and associated documentation files (the "Software"),
       6             :  * to deal in the Software without restriction, including without limitation
       7             :  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
       8             :  * and/or sell copies of the Software, and to permit persons to whom the
       9             :  * Software is furnished to do so, subject to the following conditions:
      10             :  *
      11             :  * The above copyright notice and this permission notice shall be included
      12             :  * in all copies or substantial portions of the Software.
      13             :  *
      14             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
      15             :  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      16             :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      17             :  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
      18             :  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
      19             :  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH
      20             :  * THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      21             :  */
      22             : 
      23             : /** @file
      24             :  * Split input into PGN tokens and dispatch them to a "visiting" parser.
      25             :  */
      26             : 
      27             : #ifndef _PGN_LEXER_H
      28             : #define _PGN_LEXER_H
      29             : 
      30             : #include <algorithm>
      31             : #include <cassert>
      32             : 
      33             : namespace pgn_impl {
      34             : // "PGN character data is organized as tokens. A token is a contiguous
      35             : // sequence of characters that represents a basic semantic unit. Tokens
      36             : // may be separated from adjacent tokens by white space characters.
      37             : // (White space characters include space, newline, and tab characters.)
      38             : // Some tokens are self delimiting and do not require white space
      39             : // characters."
      40             : 
      41             : /**
      42             :  * Creates a 128 bits bitmap of PGN symbol characters.
      43             :  *
      44             :  * "A symbol token starts with a letter or digit character and is immediately
      45             :  * followed by a sequence of zero or more symbol continuation characters.
      46             :  * These continuation characters are letter characters ("A-Za-z"), digit
      47             :  * characters ("0-9"), the underscore ("_"), the plus sign ("+"), the octothorpe
      48             :  * sign ("#"), the equal sign ("="), the colon (":"),  and the hyphen ("-")."
      49             :  * @param elem: 0 for the lower 64 bits, 1 for the higher 64 bits.
      50             :  * @returns the requested half of the bitmap.
      51             :  */
      52             : constexpr unsigned long long init_symbol_map(unsigned elem) {
      53             :         return (elem == 0) ? 0x27ffb80800000000 : 0x47fffffe87ffffff;
      54             : 
      55             :         /* Requires gcc >= 6.2 or clang >= 3.5
      56             : 
      57             :            unsigned long long res[2] = {0};
      58             :            for (unsigned ch = 'A'; ch <= 'Z'; ++ch) {
      59             :                res[ch / 64] |= (1ULL << (ch % 64));
      60             :            }
      61             :            for (unsigned ch = 'a'; ch <= 'z'; ++ch) {
      62             :                res[ch / 64] |= (1ULL << (ch % 64));
      63             :            }
      64             :            for (unsigned ch = '0'; ch <= '9'; ++ch) {
      65             :                res[ch / 64] |= (1ULL << (ch % 64));
      66             :            }
      67             :            const unsigned extra[] = {'_', '+', '#', '=', ':', '-'};
      68             :            for (unsigned ch : extra) {
      69             :                res[ch / 64] |= (1ULL << (ch % 64));
      70             :            }
      71             :            const unsigned drawresult_unclear[] = {'/', '~'};
      72             :            for (unsigned ch : drawresult_unclear) {
      73             :                res[ch / 64] |= (1ULL << (ch % 64));
      74             :            }
      75             :            const unsigned chess_variants[] = {',', '@'};
      76             :            for (unsigned ch : chess_variants) {
      77             :                res[ch / 64] |= (1ULL << (ch % 64));
      78             :            }
      79             :            return res[elem];
      80             :    */
      81             : }
      82             : 
      83             : /**
      84             :  * Checks if the given character is a PGN symbol.
      85             :  * @param ch: character to classify.
      86             :  * @returns true if @e ch is a PGN symbol character, false otherwise.
      87             :  */
      88     7559779 : inline bool is_PGNsymbol(unsigned char ch) {
      89     7559779 :         constexpr unsigned long long tok_map[] = {init_symbol_map(0),
      90             :                                                   init_symbol_map(1)};
      91     7559779 :         auto high = ch / 64;
      92     7559779 :         auto low = ch % 64;
      93     7559779 :         return high > 1 ? false : tok_map[high] & (1ULL << low);
      94             : }
      95             : 
      96             : /**
      97             :  * Checks if the given character is one of the 10 decimal digits: 0123456789.
      98             :  * @param ch: character to classify.
      99             :  * @returns true if the character is a numeric character, false otherwise.
     100             :  */
     101     4248015 : inline bool is_PGNdigit(unsigned char ch) { return ch >= '0' && ch <= '9'; }
     102             : 
     103             : /**
     104             :  * Checks if the given character is a white space ("white space characters
     105             :  * include space, newline, and tab characters").
     106             :  * @param ch: character to classify.
     107             :  * @returns true if the character is a white space, false otherwise.
     108             :  */
     109       45603 : inline bool is_PGNwhitespace(unsigned char ch) {
     110       45603 :         return (ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t' || ch == '\v');
     111             : }
     112             : 
     113             : /**
     114             :  * Checks if a token is the game termination marker.
     115             :  *
     116             :  * "The game termination marker is a symbol that is one of the following four
     117             :  * values: "1-0" (White wins), "0-1" (Black wins), "1/2-1/2" (drawn game),
     118             :  * and "*" (game in progress, result unknown, or game abandoned)."
     119             :  * @param tok: the token to classify.
     120             :  * @returns
     121             :  * - '1' for "White wins",
     122             :  * - '0' for "Black wins",
     123             :  * - '/' for "drawn game",
     124             :  * - '*' for "result unknown",
     125             :  * - 0 (false) if it's not a termination marker.
     126             :  */
     127          33 : template <typename TView> char is_PGNtermination(TView tok) {
     128          33 :         auto n_chars = std::distance(tok.first, tok.second);
     129          33 :         if (n_chars == 3) {
     130          33 :                 if (std::equal(tok.first, tok.first + 3, "1-0"))
     131           9 :                         return '1';
     132          24 :                 if (std::equal(tok.first, tok.first + 3, "0-1"))
     133          24 :                         return '0';
     134           0 :                 if (std::equal(tok.first, tok.first + 3, "1/2"))
     135           0 :                         return '/';
     136           0 :                 if (std::equal(tok.first, tok.first + 3, "1:0"))
     137           0 :                         return '1';
     138           0 :                 if (std::equal(tok.first, tok.first + 3, "0:1"))
     139           0 :                         return '0';
     140           0 :         } else if (n_chars == 7) {
     141           0 :                 if (std::equal(tok.first, tok.first + 7, "1/2-1/2") ||
     142           0 :                     std::equal(tok.first, tok.first + 7, "1/2:1/2"))
     143           0 :                         return '/';
     144             :         }
     145           0 :         return 0;
     146             : }
     147             : 
     148             : /**
     149             :  * Read a token and dispatch it to a PGN parser.
     150             :  * The first char of the token is used to determine its termination.
     151             :  * @param ch:      the first char of the token.
     152             :  * @param input:   the input to get data from.
     153             :  * @param parser:  will receive the tokens via visitPGN_* functions.
     154             :  * @param section: -1 pregame, 0 for tag pair section, 1 for movetext section.
     155             :  * @returns the result of the invoked parser's function.
     156             :  */
     157             : template <typename TInput, typename TVisitor>
     158     8013049 : bool parse_token(char ch, TInput& input, TVisitor& parser, int& section) {
     159     8013049 :         switch (ch) {
     160     2389563 :         case ' ':  // self terminating
     161             :         case '\t': // self terminating
     162             :         case '\v': // self terminating
     163             :         case '\r': // self terminating
     164     2389563 :                 return true;
     165             : 
     166      129953 :         case '\n': // self terminating
     167      129953 :                 return parser.visitPGN_EndOfLine();
     168             : 
     169     1909592 :         case '.': // self terminating
     170     1909592 :                 return true;
     171             : 
     172           0 :         case '<': // self terminating
     173           0 :                 return true;
     174             : 
     175           0 :         case '>': // self terminating
     176           0 :                 return true;
     177             : 
     178        2001 :         case '*': // self terminating
     179        2001 :                 return parser.visitPGN_ResultFinal('*');
     180             : 
     181      309108 :         case '(': // self terminating
     182      309108 :                 return parser.visitPGN_VariationStart();
     183             : 
     184      309108 :         case ')': // self terminating
     185      309108 :                 return parser.visitPGN_VariationEnd();
     186             : 
     187       15146 :         case '[': // --> ']', can span multiple lines
     188       15146 :                 if (section <= 0) {
     189       15144 :                         section = 0;
     190       45432 :                         auto skip_spaces = [&]() {
     191       30288 :                                 auto spaces = input.read_while(is_PGNwhitespace);
     192       60610 :                                 while (spaces.first != spaces.second) {
     193       15161 :                                         if (*spaces.first++ == '\n')
     194           9 :                                                 parser.visitPGN_EndOfLine();
     195             :                                 }
     196       30288 :                         };
     197       15144 :                         skip_spaces();
     198       15144 :                         auto tag = input.read_while(is_PGNsymbol);
     199             : 
     200       15144 :                         skip_spaces();
     201       15144 :                         auto value = input.read_until(']');
     202             : 
     203             :                         // Remove the " char at the start and deal with the special case of
     204             :                         // a ] char inside the string token.
     205       15144 :                         if (value.first != value.second && *value.first == '"') {
     206       15167 :                                 auto is_terminated = [&]() {
     207         338 :                                         for (auto it = value.first; it != value.second; ++it) {
     208         303 :                                                 if (*it == '"')
     209           2 :                                                         return true;
     210         304 :                                                 if (*it == '\\' && ++it == value.second)
     211           0 :                                                         break;
     212             :                                         }
     213          32 :                                         return false;
     214             :                                 };
     215       15133 :                                 ++value.first;
     216       15197 :                                 while (!input.last_column() && !is_terminated()) {
     217          32 :                                         value.second = input.read_until(']').second;
     218             :                                 }
     219             :                         }
     220             :                         // trim right
     221       15182 :                         while (value.first != value.second) {
     222       15163 :                                 auto last_ch = *--value.second;
     223       15163 :                                 if (last_ch == '"') {
     224       15135 :                                         break;
     225             :                                 }
     226          28 :                                 if (!is_PGNwhitespace(last_ch)) {
     227           9 :                                         ++value.second;
     228           9 :                                         break;
     229             :                                 }
     230          19 :                                 if (last_ch == '\n')
     231           3 :                                         parser.visitPGN_EndOfLine();
     232             :                         }
     233       15144 :                         return parser.visitPGN_TagPair(tag, value);
     234             :                 }
     235           2 :                 input.sungetc();
     236           2 :                 parser.visitPGN_inputUnexpectedPGNHeader();
     237           2 :                 return false;
     238             : 
     239      224028 :         case '{': // --> '}', can span multiple lines
     240      224028 :                 return parser.visitPGN_Comment(input.read_until('}'));
     241             : 
     242           4 :         case ';': // --> '\n'
     243           4 :                 return parser.visitPGN_Comment(input.read_line());
     244             : 
     245           1 :         case '%': // --> '\n', only if "appearing in the first column of a line"
     246           1 :                 if (input.first_column()) {
     247           1 :                         return parser.visitPGN_Escape(input.read_line());
     248             :                 }
     249             :                 return parser.visitPGN_Unknown(
     250           0 :                     input.read_token([](char c) { return c == '%'; }));
     251             : 
     252         384 :         case '$': // terminated just prior to the first non-digit character
     253         384 :                 return parser.visitPGN_NAG(input.read_token(is_PGNdigit));
     254             : 
     255           0 :         case '?': // Suffix annotations: "!", "?", "!!", "!?", "?!", and "??"
     256             :         case '!': // "At most one such suffix annotation may appear per move"
     257             :                 return parser.visitPGN_Suffix(
     258           0 :                     input.read_token([](char c) { return c == '!' || c == '?'; }));
     259             :         }
     260             : 
     261             :         // "A symbol token is terminated just prior to the first non-symbol
     262             :         // character following the symbol character sequence."
     263     2724161 :         auto tok = input.read_token(is_PGNsymbol);
     264     2724161 :         bool epd = (section < 0 && std::count(tok.first, tok.second, '/') == 7);
     265     2724161 :         section = 1;
     266             : 
     267     2724161 :         if (epd) {
     268           4 :                 tok.second = input.read_line().second;
     269           4 :                 parser.visitPGN_EPD(tok);
     270           4 :                 return false;
     271             :         }
     272             : 
     273     2724157 :         auto notdigit = std::find_if_not(tok.first, tok.second, is_PGNdigit);
     274     2724157 :         if (notdigit == tok.first)
     275     1565565 :                 return parser.visitPGN_SANMove(tok);
     276             : 
     277     1158592 :         if (notdigit == tok.second)
     278     1158559 :                 return parser.visitPGN_MoveNum(tok);
     279             : 
     280          33 :         if (auto result = is_PGNtermination(tok))
     281          33 :                 return parser.visitPGN_ResultFinal(result);
     282             : 
     283           0 :         return parser.visitPGN_Unknown(tok);
     284             : }
     285             : 
     286             : class InputMemory {
     287             :         const char* const begin_;
     288             :         const char* const end_;
     289             :         const char* it_;
     290             : 
     291             : public:
     292        2070 :         InputMemory(const char* begin, const char* end)
     293        2070 :             : begin_(begin), end_(end), it_(begin) {}
     294             : 
     295             :         /// Reads one character and advances the input sequence by one character.
     296     8013049 :         char sbumpc() {
     297     8013049 :                 assert(it_ != end_);
     298     8013049 :                 return *it_++;
     299             :         };
     300             : 
     301             :         /// Makes the most recently extracted character available again.
     302           2 :         void sungetc() {
     303           2 :                 assert(it_ != begin_ && it_ != end_);
     304           2 :                 --it_;
     305           2 :         }
     306             : 
     307             :         /// Returns true if there are no chars available.
     308     8028244 :         bool eof() const { return it_ == end_; };
     309             : 
     310             :         /// Returns the number of chars read.
     311        2071 :         std::size_t n_read() const { return std::distance(begin_, it_); }
     312             : 
     313             :         /// Returns true if the most recently extracted character was the first
     314             :         /// character of the line.
     315           1 :         bool first_column() const { return (n_read() < 2 || *(it_ - 2) == '\n'); };
     316             : 
     317             :         /// Returns true if the most recently extracted character was the last
     318             :         /// character of the line.
     319       15165 :         bool last_column() const { return eof() || *it_ == '\n' || *it_ == '\r'; }
     320             : 
     321             :         /// Returns the range of chars: [curr_char, '\n').
     322             :         /// The '\n' char is left as the next character to extract.
     323           9 :         std::pair<const char*, const char*> read_line() {
     324           9 :                 auto first = it_;
     325           9 :                 it_ = std::find(it_, end_, '\n');
     326           9 :                 return {first, it_};
     327             :         }
     328             : 
     329             :         /// Returns the range of chars: [curr_char, delim).
     330             :         /// The delim char is skipped.
     331      239204 :         std::pair<const char*, const char*> read_until(char delim) {
     332      239204 :                 auto first = it_;
     333      239204 :                 it_ = std::find(it_, end_, delim);
     334      239204 :                 auto second = (it_ == end_) ? it_ : it_++;
     335      239204 :                 return {first, second};
     336             :         }
     337             : 
     338             :         /// Returns the range of chars: [curr_char, cond == true].
     339             :         template <typename Cond>
     340       45432 :         std::pair<const char*, const char*> read_while(Cond cond) {
     341       45432 :                 auto first = it_;
     342       45432 :                 it_ = std::find_if_not(it_, end_, cond);
     343       45432 :                 return {first, it_};
     344             :         }
     345             : 
     346             :         /// Returns the range of chars: [last_extracted_char, cond == true].
     347             :         /// cond is not applied to last_extracted_char.
     348             :         template <typename Cond>
     349     2724545 :         std::pair<const char*, const char*> read_token(Cond cond) {
     350     2724545 :                 assert(it_ != begin_);
     351     2724545 :                 auto first = it_ - 1;
     352     2724545 :                 it_ = std::find_if_not(it_, end_, cond);
     353     2724545 :                 return {first, it_};
     354             :         }
     355             : };
     356             : 
     357             : } // namespace pgn_impl
     358             : 
     359             : namespace pgn {
     360             : 
     361             : /**
     362             :  * Read a PGN game from memory, grouping characters in tokens and dispatching
     363             :  * them to a PGN parser.
     364             :  * @param input:  the memory range containing the PGN game.
     365             :  * @param parser: will receive the tokens via visitPGN_* functions.
     366             :  *                Parsing is aborted if it returns false.
     367             :  * @returns a std::pair containing the number of chars parsed, and true if at
     368             :  * least a tag-pair token or a symbol token was dispatched.
     369             :  */
     370             : template <typename TVisitor>
     371        2070 : std::pair<std::size_t, bool> parse_game(pgn_impl::InputMemory input,
     372             :                                         TVisitor&& parser) {
     373        2070 :         int section = -1;
     374     8011009 :         do {
     375     8013079 :                 if (input.eof()) {
     376          30 :                         if (section >= 0)
     377          12 :                                 parser.visitPGN_inputEOF();
     378          30 :                         break;
     379             :                 }
     380     8013049 :         } while (pgn_impl::parse_token(input.sbumpc(), input, parser, section));
     381             : 
     382        2070 :         return {input.n_read(), section >= 0};
     383             : }
     384             : 
     385             : /**
     386             :  * Normalize white spaces and converts Latin-1 chars to UTF-8 sequences.
     387             :  *
     388             :  * The original PGN standard used a subset of ISO 8859/1 (Latin 1):
     389             :  * "Code value from 0 to 126 are the standard ASCII character set."
     390             :  * "Code value from 127 to 191 are not used for PGN data representation."
     391             :  * "Code value from 192 to 255 are mostly alphabetic printing characters with
     392             :  * various diacritical marks; their use is encouraged for those languages
     393             :  * that require such characters."
     394             :  * However this do not allow internationalization for comments and names
     395             :  * (players, sites, etc...); the common UTF-8 is a superior alternative.
     396             :  * @param unescape: if true converts \\ to \ and \" to ".
     397             :  * @param str: the string to be normalized.
     398             :  * @param pos: start of the substring of @e str that will be normalized.
     399             :  * @returns the number of '\n' chars in @e str.
     400             :  */
     401             : template <bool unescape = false, typename TString>
     402      235002 : std::size_t normalize(TString& str, std::size_t pos) {
     403      235002 :         std::size_t n_newlines = 0;
     404    33953079 :         for (std::size_t i = pos, n = str.size(); i < n; ++i) {
     405    33718077 :                 unsigned char ch = str[i];
     406             :                 // An invalid UTF-8 sequence is considered a Latin1 char and converted.
     407    33718077 :                 if (ch > 0xBF) {
     408         772 :                         unsigned char nxt = (i + 1 != n) ? str[i + 1] : 0;
     409         772 :                         if (nxt < 0x80 || nxt > 0xBF) {
     410         231 :                                 str[i] = static_cast<unsigned char>(ch & 0xBF);
     411         231 :                                 str.insert(str.begin() + i, static_cast<unsigned char>(0xC3));
     412         231 :                                 ++i;
     413         231 :                                 ++n;
     414             :                         }
     415    33717305 :                 } else if (ch == '\n' || ch == '\r' || ch == '\t' || ch == '\v') {
     416         800 :                         if (ch == '\n')
     417         725 :                                 ++n_newlines;
     418             : 
     419             :                         // Tab and new line characters are removed if there is an adjacent
     420             :                         // space, or converted to a normal space otherwise.
     421        2390 :                         if (i == pos ||                             // First char
     422        1570 :                             (i + 1) == n ||                         // Last char
     423        1706 :                             str[i - 1] == ' ' ||                    // Preceded by a space
     424         126 :                             pgn_impl::is_PGNwhitespace(str[i + 1])) // Followed by a space
     425             :                         {
     426         699 :                                 str.erase(i, 1);
     427         699 :                                 --i;
     428         699 :                                 --n;
     429             :                         } else {
     430         101 :                                 str[i] = ' ';
     431             :                         }
     432       61056 :                 } else if (unescape && ch == '\\' && i + 1 != n) {
     433             :                         // "A quote inside a string is represented by the backslash
     434             :                         // immediately followed by a quote. A backslash inside a string is
     435             :                         // represented by two adjacent backslashes."
     436          32 :                         if (str[i + 1] == '\\' || str[i + 1] == '"') {
     437           5 :                                 str.erase(i, 1);
     438           5 :                                 --n;
     439             :                         }
     440             :                 }
     441             :         }
     442      235002 :         return n_newlines;
     443             : }
     444             : 
     445             : /**
     446             :  * Escape quote and backslash chars according to the PGN standard:
     447             :  * "A quote inside a string is represented by the backslash immediately followed
     448             :  * by a quote. A backslash inside a string is represented by two adjacent
     449             :  * backslashes."
     450             :  * @param str: the string containing the chars to be escaped.
     451             :  * @param pos: start of the substring of @e str to be processed.
     452             :  */
     453             : template <typename TString> void escape_string(TString& str, std::size_t pos) {
     454             :         auto it = str.begin() + pos;
     455             :         while (true) {
     456             :                 it = std::find_if(it, str.end(),
     457             :                                   [](char ch) { return ch == '\\' || ch == '\"'; });
     458             :                 if (it != str.end())
     459             :                         it = str.insert(it, '\\') + 2;
     460             :                 else
     461             :                         break;
     462             :         }
     463             : }
     464             : 
     465             : /**
     466             :  * Trim leading and trailing white spaces.
     467             :  * @param str: the string to trim.
     468             :  * @returns the number of '\n' chars in @e str.
     469             :  */
     470      224042 : template <typename TView> std::size_t trim(TView& str) {
     471      224042 :         std::size_t n_newlines = 0;
     472     1398747 :         auto is_space = [&n_newlines](char ch) {
     473      811412 :                 if (ch == '\n') {
     474      363293 :                         ++n_newlines;
     475      448119 :                 } else if (ch != ' ' && ch != '\r' && ch != '\t' && ch != '\v') {
     476      448082 :                         return false;
     477             :                 }
     478      363330 :                 return true;
     479             :         };
     480      224042 :         str.first = std::find_if_not(str.first, str.second, is_space);
     481             : 
     482             :         using RevIt = std::reverse_iterator<decltype(str.first)>;
     483      224042 :         str.second =
     484             :             std::find_if_not(RevIt(str.second), RevIt(str.first), is_space).base();
     485             : 
     486      224042 :         return n_newlines;
     487             : }
     488             : 
     489             : } // namespace pgn
     490             : 
     491             : #endif // _PGN_LEXER_H

Generated by: LCOV version 1.13