LCOV - code coverage report
Current view: top level - src - pgn_lexer.h (source / functions) Hit Total Coverage
Test: coverage.info Lines: 154 174 88.5 %
Date: 2018-02-05 16:49:44 Functions: 26 39 66.7 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (C) 2018  Fulvio Benini.
       3             :  *
       4             :  * Permission is hereby granted, free of charge, to any person obtaining a
       5             :  * copy of this software and associated documentation files (the "Software"),
       6             :  * to deal in the Software without restriction, including without limitation
       7             :  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
       8             :  * and/or sell copies of the Software, and to permit persons to whom the
       9             :  * Software is furnished to do so, subject to the following conditions:
      10             :  *
      11             :  * The above copyright notice and this permission notice shall be included
      12             :  * in all copies or substantial portions of the Software.
      13             :  *
      14             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
      15             :  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      16             :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      17             :  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
      18             :  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
      19             :  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH
      20             :  * THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      21             :  */
      22             : 
      23             : /** @file
      24             :  * Split input into PGN tokens and dispatch them to a "visiting" parser.
      25             :  */
      26             : 
      27             : #ifndef _PGN_LEXER_H
      28             : #define _PGN_LEXER_H
      29             : 
      30             : #include <algorithm>
      31             : #include <cassert>
      32             : 
      33             : namespace pgn_impl {
      34             : // "PGN character data is organized as tokens. A token is a contiguous
      35             : // sequence of characters that represents a basic semantic unit. Tokens
      36             : // may be separated from adjacent tokens by white space characters.
      37             : // (White space characters include space, newline, and tab characters.)
      38             : // Some tokens are self delimiting and do not require white space
      39             : // characters."
      40             : 
      41             : /**
      42             :  * Creates a 128 bits bitmap of PGN symbol characters.
      43             :  *
      44             :  * "A symbol token starts with a letter or digit character and is immediately
      45             :  * followed by a sequence of zero or more symbol continuation characters.
      46             :  * These continuation characters are letter characters ("A-Za-z"), digit
      47             :  * characters ("0-9"), the underscore ("_"), the plus sign ("+"), the octothorpe
      48             :  * sign ("#"), the equal sign ("="), the colon (":"),  and the hyphen ("-")."
      49             :  * @param elem: 0 for the lower 64 bits, 1 for the higher 64 bits.
      50             :  * @returns the requested half of the bitmap.
      51             :  */
      52             : constexpr unsigned long long init_symbol_map(unsigned elem) {
      53             :         return (elem == 0) ? 0x27ffb80800000000 : 0x47fffffe87ffffff;
      54             : 
      55             :         /* Requires gcc >= 6.2 or clang >= 3.5
      56             : 
      57             :            unsigned long long res[2] = {0};
      58             :            for (unsigned ch = 'A'; ch <= 'Z'; ++ch) {
      59             :                res[ch / 64] |= (1ULL << (ch % 64));
      60             :            }
      61             :            for (unsigned ch = 'a'; ch <= 'z'; ++ch) {
      62             :                res[ch / 64] |= (1ULL << (ch % 64));
      63             :            }
      64             :            for (unsigned ch = '0'; ch <= '9'; ++ch) {
      65             :                res[ch / 64] |= (1ULL << (ch % 64));
      66             :            }
      67             :            const unsigned extra[] = {'_', '+', '#', '=', ':', '-'};
      68             :            for (unsigned ch : extra) {
      69             :                res[ch / 64] |= (1ULL << (ch % 64));
      70             :            }
      71             :            const unsigned drawresult_unclear[] = {'/', '~'};
      72             :            for (unsigned ch : drawresult_unclear) {
      73             :                res[ch / 64] |= (1ULL << (ch % 64));
      74             :            }
      75             :            const unsigned chess_variants[] = {',', '@'};
      76             :            for (unsigned ch : chess_variants) {
      77             :                res[ch / 64] |= (1ULL << (ch % 64));
      78             :            }
      79             :            return res[elem];
      80             :    */
      81             : }
      82             : 
      83             : /**
      84             :  * Checks if the given character is a PGN symbol.
      85             :  * @param ch: character to classify.
      86             :  * @returns true if @e ch is a PGN symbol character, false otherwise.
      87             :  */
      88     7559447 : inline bool is_PGNsymbol(unsigned ch) {
      89     7559447 :         constexpr unsigned long long tok_map[] = {init_symbol_map(0),
      90             :                                                   init_symbol_map(1)};
      91     7559447 :         unsigned high = ch / 64;
      92     7559447 :         unsigned low = ch % 64;
      93     7559447 :         return high > 1 ? false : tok_map[high] & (1ULL << low);
      94             : }
      95             : 
      96             : /**
      97             :  * Checks if the given character is one of the 10 decimal digits: 0123456789.
      98             :  * @param ch: character to classify.
      99             :  * @returns true if the character is a numeric character, false otherwise.
     100             :  */
     101     4248013 : inline bool is_PGNdigit(unsigned char ch) { return ch >= '0' && ch <= '9'; }
     102             : 
     103             : /**
     104             :  * Checks if the given character is a white space ("white space characters
     105             :  * include space, newline, and tab characters").
     106             :  * @param ch: character to classify.
     107             :  * @returns true if the character is a white space, false otherwise.
     108             :  */
     109       53006 : inline bool is_PGNwhitespace(unsigned char ch) {
     110       53006 :         return (ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t' || ch == '\v');
     111             : }
     112             : 
     113             : /**
     114             :  * Checks if a token is the game termination marker.
     115             :  *
     116             :  * "The game termination marker is a symbol that is one of the following four
     117             :  * values: "1-0" (White wins), "0-1" (Black wins), "1/2-1/2" (drawn game),
     118             :  * and "*" (game in progress, result unknown, or game abandoned)."
     119             :  * @param tok: the token to classify.
     120             :  * @returns
     121             :  * - '1' for "White wins",
     122             :  * - '0' for "Black wins",
     123             :  * - '/' for "drawn game",
     124             :  * - '*' for "result unknown",
     125             :  * - 0 (false) if it's not a termination marker.
     126             :  */
     127          32 : template <typename TView> char is_PGNtermination(TView tok) {
     128          32 :         auto n_chars = std::distance(tok.first, tok.second);
     129          32 :         if (n_chars == 3) {
     130          32 :                 if (std::equal(tok.first, tok.first + 3, "1-0"))
     131           9 :                         return '1';
     132          23 :                 if (std::equal(tok.first, tok.first + 3, "0-1"))
     133          23 :                         return '0';
     134           0 :                 if (std::equal(tok.first, tok.first + 3, "1/2"))
     135           0 :                         return '/';
     136           0 :                 if (std::equal(tok.first, tok.first + 3, "1:0"))
     137           0 :                         return '1';
     138           0 :                 if (std::equal(tok.first, tok.first + 3, "0:1"))
     139           0 :                         return '0';
     140           0 :         } else if (n_chars == 7) {
     141           0 :                 if (std::equal(tok.first, tok.first + 7, "1/2-1/2") ||
     142           0 :                     std::equal(tok.first, tok.first + 7, "1/2:1/2"))
     143           0 :                         return '/';
     144             :         }
     145           0 :         return 0;
     146             : }
     147             : 
     148             : /**
     149             :  * Read a token and dispatch it to a PGN parser.
     150             :  * The first char of the token is used to determine its termination.
     151             :  * @param ch:      the first char of the token.
     152             :  * @param input:   the input to get data from.
     153             :  * @param parser:  will receive the tokens via visitPGN_* functions.
     154             :  * @param section: -1 pregame, 0 for tag pair section, 1 for movetext section.
     155             :  * @returns the result of the invoked parser's function.
     156             :  */
     157             : template <typename TInput, typename TVisitor>
     158     8012975 : bool parse_token(char ch, TInput& input, TVisitor& parser, int& section) {
     159     8012975 :         switch (ch) {
     160     2389563 :         case ' ':  // self terminating
     161             :         case '\t': // self terminating
     162             :         case '\v': // self terminating
     163             :         case '\r': // self terminating
     164     2389563 :                 return true;
     165             : 
     166      129916 :         case '\n': // self terminating
     167      129916 :                 return parser.visitPGN_EndOfLine();
     168             : 
     169     1909592 :         case '.': // self terminating
     170     1909592 :                 return true;
     171             : 
     172           0 :         case '<': // self terminating
     173           0 :                 return true;
     174             : 
     175           0 :         case '>': // self terminating
     176           0 :                 return true;
     177             : 
     178        2001 :         case '*': // self terminating
     179        2001 :                 parser.visitPGN_ResultFinal('*');
     180        2001 :                 return false;
     181             : 
     182      309108 :         case '(': // self terminating
     183      309108 :                 return parser.visitPGN_VariationStart();
     184             : 
     185      309108 :         case ')': // self terminating
     186      309108 :                 return parser.visitPGN_VariationEnd();
     187             : 
     188       15110 :         case '[': // --> ']', can span multiple lines
     189       15110 :                 if (section <= 0) {
     190       15108 :                         section = 0;
     191       45324 :                         auto skip_spaces = [&]() {
     192       30216 :                                 auto spaces = input.read_while(is_PGNwhitespace);
     193       60466 :                                 while (spaces.first != spaces.second) {
     194       15125 :                                         if (*spaces.first++ == '\n')
     195           9 :                                                 parser.visitPGN_EndOfLine();
     196             :                                 }
     197       30216 :                         };
     198       15108 :                         skip_spaces();
     199       15108 :                         auto tag = input.read_while(is_PGNsymbol);
     200             : 
     201       15108 :                         skip_spaces();
     202       15108 :                         auto value = input.read_until(']');
     203             : 
     204             :                         // Remove the " char at the start and deal with the special case of
     205             :                         // a ] char inside the string token.
     206       15108 :                         if (value.first != value.second && *value.first == '"') {
     207       15156 :                                 auto is_terminated = [&]() {
     208         536 :                                         for (auto it = value.first; it != value.second; ++it) {
     209         450 :                                                 if (*it == '"')
     210           2 :                                                         return true;
     211         477 :                                                 if (*it == '\\' && ++it == value.second)
     212           0 :                                                         break;
     213             :                                         }
     214          57 :                                         return false;
     215             :                                 };
     216       15097 :                                 ++value.first;
     217       15211 :                                 while (!input.last_column() && !is_terminated()) {
     218          57 :                                         value.second = input.read_until(']').second;
     219             :                                 }
     220             :                         }
     221             :                         // trim right
     222       15146 :                         while (value.first != value.second) {
     223       15127 :                                 auto last_ch = *--value.second;
     224       15127 :                                 if (last_ch == '"') {
     225       15099 :                                         break;
     226             :                                 }
     227          28 :                                 if (!is_PGNwhitespace(last_ch)) {
     228           9 :                                         ++value.second;
     229           9 :                                         break;
     230             :                                 }
     231          19 :                                 if (last_ch == '\n')
     232           3 :                                         parser.visitPGN_EndOfLine();
     233             :                         }
     234       15108 :                         return parser.visitPGN_TagPair(tag, value);
     235             :                 }
     236           2 :                 input.sungetc();
     237           2 :                 parser.visitPGN_inputUnexpectedPGNHeader();
     238           2 :                 return false;
     239             : 
     240      224028 :         case '{': // --> '}', can span multiple lines
     241      224028 :                 return parser.visitPGN_Comment(input.read_until('}'));
     242             : 
     243           4 :         case ';': // --> '\n'
     244           4 :                 return parser.visitPGN_Comment(input.read_line());
     245             : 
     246           1 :         case '%': // --> '\n', only if "appearing in the first column of a line"
     247           1 :                 if (input.first_column()) {
     248           1 :                         return parser.visitPGN_Escape(input.read_line());
     249             :                 }
     250             :                 return parser.visitPGN_Unknown(
     251           0 :                     input.read_token([](char c) { return c == '%'; }));
     252             : 
     253         384 :         case '$': // terminated just prior to the first non-digit character
     254         384 :                 return parser.visitPGN_NAG(input.read_token(is_PGNdigit));
     255             : 
     256           0 :         case '?': // Suffix annotations: "!", "?", "!!", "!?", "?!", and "??"
     257             :         case '!': // "At most one such suffix annotation may appear per move"
     258             :                 return parser.visitPGN_Suffix(
     259           0 :                     input.read_token([](char c) { return c == '!' || c == '?'; }));
     260             :         }
     261             : 
     262             :         // "A symbol token is terminated just prior to the first non-symbol
     263             :         // character following the symbol character sequence."
     264     2724160 :         auto tok = input.read_token(is_PGNsymbol);
     265     2724160 :         bool epd = (section < 0 && std::count(tok.first, tok.second, '/') == 7);
     266     2724160 :         section = 1;
     267             : 
     268     2724160 :         if (epd) {
     269           4 :                 tok.second = input.read_line().second;
     270           4 :                 parser.visitPGN_EPD(tok);
     271           4 :                 return false;
     272             :         }
     273             : 
     274     2724156 :         auto notdigit = std::find_if_not(tok.first, tok.second, is_PGNdigit);
     275     2724156 :         if (notdigit == tok.first)
     276     1565565 :                 return parser.visitPGN_SANMove(tok);
     277             : 
     278     1158591 :         if (notdigit == tok.second)
     279     1158559 :                 return parser.visitPGN_MoveNum(tok);
     280             : 
     281          32 :         auto result = is_PGNtermination(tok);
     282          32 :         if (result) {
     283          32 :                 parser.visitPGN_ResultFinal(result);
     284          32 :                 return false;
     285             :         }
     286             : 
     287           0 :         return parser.visitPGN_Unknown(tok);
     288             : }
     289             : 
     290             : class InputMemory {
     291             :         const char* const begin_;
     292             :         const char* const end_;
     293             :         const char* it_;
     294             : 
     295             : public:
     296        2069 :         InputMemory(const char* begin, const char* end)
     297        2069 :             : begin_(begin), end_(end), it_(begin) {}
     298             : 
     299             :         /// Reads one character and advances the input sequence by one character.
     300     8012975 :         char sbumpc() {
     301     8012975 :                 assert(it_ != end_);
     302     8012975 :                 return *it_++;
     303             :         };
     304             : 
     305             :         /// Makes the most recently extracted character available again.
     306           2 :         void sungetc() {
     307           2 :                 assert(it_ != begin_ && it_ != end_);
     308           2 :                 --it_;
     309           2 :         }
     310             : 
     311             :         /// Returns true if there are no chars available.
     312     8028159 :         bool eof() const { return it_ == end_; };
     313             : 
     314             :         /// Returns the number of chars read.
     315        2070 :         std::size_t n_read() const { return std::distance(begin_, it_); }
     316             : 
     317             :         /// Returns true if the most recently extracted character was the first
     318             :         /// character of the line.
     319           1 :         bool first_column() const { return (n_read() < 2 || *(it_ - 2) == '\n'); };
     320             : 
     321             :         /// Returns true if the most recently extracted character was the last
     322             :         /// character of the line.
     323       15154 :         bool last_column() const { return eof() || *it_ == '\n' || *it_ == '\r'; }
     324             : 
     325             :         /// Returns the range of chars: [curr_char, '\n').
     326             :         /// The '\n' char is left as the next character to extract.
     327           9 :         std::pair<const char*, const char*> read_line() {
     328           9 :                 auto first = it_;
     329           9 :                 it_ = std::find(it_, end_, '\n');
     330           9 :                 return {first, it_};
     331             :         }
     332             : 
     333             :         /// Returns the range of chars: [curr_char, delim).
     334             :         /// The delim char is skipped.
     335      239193 :         std::pair<const char*, const char*> read_until(char delim) {
     336      239193 :                 auto first = it_;
     337      239193 :                 it_ = std::find(it_, end_, delim);
     338      239193 :                 auto second = (it_ == end_) ? it_ : it_++;
     339      239193 :                 return {first, second};
     340             :         }
     341             : 
     342             :         /// Returns the range of chars: [curr_char, cond == true].
     343             :         template <typename Cond>
     344       45324 :         std::pair<const char*, const char*> read_while(Cond cond) {
     345       45324 :                 auto first = it_;
     346       45324 :                 it_ = std::find_if_not(it_, end_, cond);
     347       45324 :                 return {first, it_};
     348             :         }
     349             : 
     350             :         /// Returns the range of chars: [last_extracted_char, cond == true].
     351             :         /// cond is not applied to last_extracted_char.
     352             :         template <typename Cond>
     353     2724544 :         std::pair<const char*, const char*> read_token(Cond cond) {
     354     2724544 :                 assert(it_ != begin_);
     355     2724544 :                 auto first = it_ - 1;
     356     2724544 :                 it_ = std::find_if_not(it_, end_, cond);
     357     2724544 :                 return {first, it_};
     358             :         }
     359             : };
     360             : 
     361             : } // namespace pgn_impl
     362             : 
     363             : namespace pgn {
     364             : 
     365             : /**
     366             :  * Read a PGN game from memory, grouping characters in tokens and dispatching
     367             :  * them to a PGN parser.
     368             :  * @param input:  the memory range containing the PGN game.
     369             :  * @param parser: will receive the tokens via visitPGN_* functions.
     370             :  *                Parsing is aborted if it returns false.
     371             :  * @returns a std::pair containing the number of chars parsed, and true if at
     372             :  * least a tag-pair token or a symbol token was dispatched.
     373             :  */
     374             : template <typename TVisitor>
     375        2069 : std::pair<std::size_t, bool> parse_game(pgn_impl::InputMemory input,
     376             :                                         TVisitor&& parser) {
     377        2069 :         int section = -1;
     378     8010936 :         do {
     379     8013005 :                 if (input.eof()) {
     380          30 :                         if (section >= 0)
     381          12 :                                 parser.visitPGN_inputEOF();
     382          30 :                         break;
     383             :                 }
     384     8012975 :         } while (pgn_impl::parse_token(input.sbumpc(), input, parser, section));
     385             : 
     386        2069 :         return {input.n_read(), section >= 0};
     387             : }
     388             : 
     389             : /**
     390             :  * Copy characters to a string normalizing and trimming white spaces, and
     391             :  * converting Latin-1 chars to UTF-8 sequences.
     392             :  *
     393             :  * The original PGN standard used a subset of ISO 8859/1 (Latin 1):
     394             :  * "Code value from 0 to 126 are the standard ASCII character set."
     395             :  * "Code value from 127 to 191 are not used for PGN data representation."
     396             :  * "Code value from 192 to 255 are mostly alphabetic printing characters with
     397             :  * various diacritical marks; their use is encouraged for those languages
     398             :  * that require such characters."
     399             :  * However this do not allow internationalization for comments and names
     400             :  * (players, sites, etc...); the common UTF-8 is a superior alternative.
     401             :  * @param src:     memory range with the source data.
     402             :  * @param dest:    the container where the characters will be stored.
     403             :  * @param destPos: the start position in @e dest where the chars will be stored.
     404             :  * @returns the number of '\n' chars in @e src.
     405             :  */
     406             : template <bool trim_spaces, typename TView, typename TString>
     407      234967 : std::size_t copy_norm(TView src, TString& dest, std::size_t destPos) {
     408      234967 :         std::size_t n_endLines = 0;
     409      234967 :         std::size_t pos = destPos;
     410      234967 :         dest.resize(pos + std::distance(src.first, src.second));
     411    68397385 :         while (src.first != src.second) {
     412    34081209 :                 unsigned char ch = *src.first++;
     413    34081209 :                 if (ch > 0xBF) {
     414         772 :                         unsigned char nxt = (src.first != src.second) ? *src.first : 0;
     415         772 :                         if (nxt < 0x80 || nxt > 0xBF) {
     416             :                                 // An invalid utf-8 sequence is considered a Latin1 char and
     417             :                                 // converted.
     418         231 :                                 dest.resize(dest.size() + 1);
     419         231 :                                 dest[pos++] = static_cast<unsigned char>(0xC3);
     420         231 :                                 ch = static_cast<unsigned char>(ch & 0xBF);
     421             :                         }
     422    34080437 :                 } else if (ch == '\n' || ch == '\r' || ch == '\t' || ch == '\v' ||
     423             :                            (trim_spaces && ch == ' ')) {
     424      371020 :                         if (ch == '\n')
     425      364016 :                                 ++n_endLines;
     426             : 
     427      920322 :                         if (pos == destPos || src.first == src.second ||
     428      742775 :                             pgn_impl::is_PGNwhitespace(*src.first) ||
     429          21 :                             (!trim_spaces && dest[pos - 1] == ' ')) {
     430             :                                 // Skip
     431      364167 :                                 dest.pop_back();
     432      364167 :                                 continue;
     433             :                         }
     434             : 
     435        6853 :                         ch = ' '; // Normalize all white spaces
     436             :                 }
     437    33717042 :                 dest[pos++] = ch;
     438             :         }
     439      234967 :         return n_endLines;
     440             : }
     441             : 
     442             : } // namespace pgn
     443             : 
     444             : #endif // _PGN_LEXER_H

Generated by: LCOV version 1.13