Scid  4.7.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
spellchk.cpp
Go to the documentation of this file.
1 /*
2 * Copyright (C) 2015 Fulvio Benini
3 * Copyright (c) 2001-2003 Shane Hudson (2nd part of the file)
4 
5 * This file is part of Scid (Shane's Chess Information Database).
6 *
7 * Scid is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation.
10 *
11 * Scid is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Scid. If not, see <http://www.gnu.org/licenses/>.
18 */
19 
20 #include "spellchk.h"
21 #include "date.h"
22 #include "filebuf.h"
23 #include "misc.h"
24 
25 
26 namespace {
27 
/// Kind of data carried by one line of a "spelling" file, as classified
/// by Parser::Parser().
enum InfoType {
    SPELL_SECTIONSTART,  // line starting with '@'
    SPELL_NEWNAME,       // any other non-empty line (a name, optionally with a '#' comment)
    SPELL_ALIAS,         // line starting with '='
    SPELL_PREFIX,        // line starting with "%Prefix "
    SPELL_INFIX,         // line starting with "%Infix "
    SPELL_SUFFIX,        // line starting with "%Suffix "
    SPELL_BIO,           // line starting with "%Bio "
    SPELL_ELO,           // line starting with "%Elo "
    SPELL_EMPTY,         // nothing left once the comment and the blanks are removed
    SPELL_OLDBIO,        // line starting with '>'
    SPELL_UNKNOWN        // line starting with '%' followed by an unrecognized keyword
};
34 
35 struct Parser {
36  char* name;
37  char* extra;
38  enum InfoType type;
39 
40  Parser(char* line);
41 };
42 
43 /**
44  * Parser::Parser() - Parse a "spelling" line.
45  *
46  * Fill data members doing the following tasks:
47  * - separate the optional comment (a comment starts with '#' and
48  * extend to the end of the line) from the name data.
49  * - remove leading and trailing white-spaces.
50  * - identify the type of data
51  */
52 Parser::Parser(char* line) {
53  ASSERT(line != 0);
54 
55  extra = strchr(line, '#');
56  if (extra != NULL) {
57  // Make [line, extra) a null terminated string
58  *extra++ = 0;
59  }
60  name = (char*) strTrimLeft(line);
62 
63  type = SPELL_UNKNOWN;
64  switch (*name) {
65  case 0:
66  type = SPELL_EMPTY;
67  break;
68  case '>':
69  type = SPELL_OLDBIO;
70  break;
71  case '=':
72  type = SPELL_ALIAS;
73  // Skip over "=" and spaces:
74  name++;
75  while (*name == ' ') { name++; }
76  break;
77  case '%':
78  if (strIsPrefix("%Elo ", name)) {
79  type = SPELL_ELO;
80  name += 5; //Skip "%Elo "
81  } else if (strIsPrefix("%Bio ", name)) {
82  type = SPELL_BIO;
83  name += 5; //Skip "%Bio "
84  } else if (strIsPrefix("%Prefix ", name)) {
85  type = SPELL_PREFIX;
86  } else if (strIsPrefix("%Infix ", name)) {
87  type = SPELL_INFIX;
88  } else if (strIsPrefix("%Suffix ", name)) {
89  type = SPELL_SUFFIX;
90  }
91  break;
92  case '@':
93  type = SPELL_SECTIONSTART;
94  name++; //Skip '@'
95  // Now check if there is a list of characters to exclude from
96  // comparisons, e.g: @PLAYER ", .-"
97  // would indicate to exclude dots, commas, spaces and dashes.
98  extra = strchr(name, '"');
99  if (extra != NULL) {
100  char* end = strchr(++extra, '"');
101  if (end != NULL) {
102  *end = 0;
103  } else {
104  extra = NULL;
105  }
106  }
107  break;
108  default:
109  type = SPELL_NEWNAME;
110  if (extra != NULL) {
111  // Spelling files can provide player informations like titles/gender,
112  // countries, highest elo, date of birth, date of death. For example:
113  // Polgar, Judit #gm+w HUN [2735] 1976
114  strTrimRight(extra);
115  }
116  }
117 }
118 
119 } // End of anonymous namespace
120 
121 
122 /**
123  * class SpellChkLoader - load data into a SpellChecker object
124  *
125  * This class take parsed "spelling" data and store it into the right
126  * data members of the associated SpellChecker object.
127  * Reading from a "spelling" file is not stateless and the Parser object
128  * cannot contain all the necessary data: a SpellChkLoader object keep track
129  * of the current nameT section and the current correct name.
130  * The SpellChkValidate object is used to log ignored data, usually
131  * caused by typos like "@Eol" or "@Preffix".
132  */
134  SpellChecker& sp_;
135  SpellChecker::SpellChkValidate& validate_;
136  nameT nt_;
137  int32_t nameIdx_;
138 
139 public:
140  SpellChkLoader(SpellChecker& sp, SpellChecker::SpellChkValidate& v)
141  : sp_(sp), validate_(v), nt_(NAME_INVALID), nameIdx_(-1) {
142  }
143 
144  errorT load(const Parser& data, bool* keepBuffer) {
145  ASSERT(keepBuffer != 0);
146  *keepBuffer = false;
147 
148  switch (data.type) {
149  case SPELL_SECTIONSTART:
150  nt_ = NameBase::NameTypeFromString(data.name);
152  if (data.extra != NULL) {
153  sp_.excludeChars_[nt_] = data.extra;
154  } else {
155  sp_.excludeChars_[nt_].clear();
156  }
157  nameIdx_ = -1;
158  return OK;
159  case SPELL_NEWNAME:
160  case SPELL_ALIAS:
161  case SPELL_PREFIX:
162  case SPELL_INFIX:
163  case SPELL_SUFFIX:
164  return nameSection(data, keepBuffer);
165  case SPELL_BIO:
166  case SPELL_ELO:
167  return playerInfo(data, keepBuffer);
168  case SPELL_EMPTY:
169  return OK;
170  case SPELL_OLDBIO:
171  case SPELL_UNKNOWN:
172  validate_.ignoredLine(data.name);
173  return OK;
174  }
175 
176  ASSERT(0);
177  return ERROR_CorruptData;
178  }
179 
180 private:
181  errorT nameSection(const Parser& data, bool* keepBuffer) {
182  // Must be in a valid name section
184 
185  switch (data.type) {
186  case SPELL_NEWNAME:
187  *keepBuffer = true;
188  ASSERT(sp_.names_[nt_].size() < (1ULL << 31));
189  nameIdx_ = static_cast<int32_t>(sp_.names_[nt_].size());
190  sp_.names_[nt_].push_back(data.name);
191  if (nt_ == NAME_PLAYER) {
192  sp_.pInfo_.push_back(data.extra);
193  }
194  /* FALLTHRU */
195  case SPELL_ALIAS:
196  if (nameIdx_ == -1) {
197  return ERROR_CorruptData;
198  } else {
199  sp_.idx_[nt_].push_back(SpellChecker::Idx(
200  sp_.normalizeAndTransform(nt_, data.name),
201  nameIdx_
202  ));
203  }
204  return OK;
205  case SPELL_PREFIX:
206  return sp_.general_[nt_].addPrefix(data.name);
207  case SPELL_INFIX:
208  return sp_.general_[nt_].addInfix(data.name);
209  case SPELL_SUFFIX:
210  return sp_.general_[nt_].addSuffix(data.name);
211  default:
212  ASSERT(0);
213  }
214 
215  return ERROR_CorruptData;
216  }
217 
218  errorT playerInfo(const Parser& data, bool* keepBuffer) {
219  // SPELL_BIO and SPELL_ELO are valid only for a PLAYER name
220  if (nt_ != NAME_PLAYER || nameIdx_ == -1) return ERROR_CorruptData;
221 
222  if (data.type == SPELL_BIO) {
223  *keepBuffer = true;
224  sp_.pInfo_[nameIdx_].bio_.push_back(data.name);
225  } else {
226  ASSERT(data.type == SPELL_ELO);
227  // if necessary, add empty PlayerElo objects
228  sp_.pElo_.resize(nameIdx_ + 1);
229  sp_.pElo_[nameIdx_].AddEloData(data.name);
230  }
231 
232  return OK;
233  }
234 };
235 
236 
237 /**
238  * SpellChecker::read() - Read a "spelling" file.
239  *
240  * This functions tries to open the @filename file and to load the data
241  * into the SpellChecker object.
242  * The object must be empty. In practice the requirement is to not call
243  * this function twice, because this is the only non-const member function.
244  * If the function fails (result != OK) the object state is undefined
245  * and the only valid operation is to destroy the object.
246  * If SPELLCHKVALIDATE is defined, it also creates a @filename.validate log.
247  */
248 errorT SpellChecker::read(const char* filename, const Progress& progress)
249 {
250  ASSERT(filename != NULL);
251  ASSERT(staticStrings_ == NULL);
252 
253  // Open the file and get the file size.
254  Filebuf file;
255  std::streamsize fileSize = -1;
256  if (file.open(filename, std::ios::in | std::ios::binary | std::ios::ate) != 0) {
257  fileSize = file.pubseekoff(0, std::ios::cur, std::ios::in);
258  file.pubseekoff(0, std::ios::beg, std::ios::in);
259  }
260  if (fileSize == -1) return ERROR_FileOpen;
261 
262  SpellChkValidate validate(filename, *this);
263 
264  // Parse the file lines
265  staticStrings_ = (char*) malloc(fileSize + 1);
266  char* bEnd = staticStrings_ + fileSize + 1;
267  char* line = staticStrings_;
268  size_t nRead;
269  uint report_i = 0;
270  std::streamsize report_done = 0;
271  SpellChkLoader loader(*this, validate);
272  while ((nRead = file.getline(line, std::distance(line, bEnd))) != 0) {
273  report_done += nRead;
274  if ((++report_i % 10000) == 0) {
275  if (!progress.report(report_done, fileSize))
276  return ERROR_UserCancel;
277  }
278 
279  bool keepBuffer;
280  errorT err = loader.load(Parser(line), &keepBuffer);
281  if (err != OK) return err;
282 
283  if (keepBuffer) line += nRead;
284  }
285  if (report_done != fileSize || file.sgetc() != EOF) return ERROR_FileRead;
286 
287  // Success:
288  if (pElo_.size() > 0) {
289  // if necessary, add empty PlayerElo objects
290  pElo_.resize(pInfo_.size());
291  validate.checkEloData();
292  }
293 
294  // Free unused memory
295  char* shrink = (char*) realloc(staticStrings_, 1 + std::distance(staticStrings_,line));
296  if (shrink != NULL && shrink != staticStrings_) {
297  // Unlikely, but realloc() moved the memory: update the pointers.
298  const char* oldAddr = staticStrings_;
299  staticStrings_ = shrink;
300  for (nameT i=0; i < NUM_NAME_TYPES; i++) {
301  for (auto& e : (names_[i]))
302  e = staticStrings_ + std::distance(oldAddr, e);
303  }
304  for (auto& e : pInfo_) {
305  e.comment_ = staticStrings_ + std::distance(oldAddr, e.comment_);
306  for (auto& bio : e.bio_) {
307  bio = staticStrings_ + std::distance(oldAddr, bio);
308  }
309  }
310  }
311 
312  // Sort the index
313  for (nameT i=0; i < NUM_NAME_TYPES; i++) {
314  std::sort(idx_[i].begin(), idx_[i].end());
315  validate.idxDuplicates(i);
316  }
317  return OK;
318 }
319 
320 
321 //////////////////////////////////////////////////////////////////////
322 //
323 // FILE: spellchk.cpp
324 // SpellChecker class methods
325 //
326 // Part of: Scid (Shane's Chess Information Database)
327 // Version: 3.5
328 //
329 // Notice: Copyright (c) 2001-2003 Shane Hudson. All rights reserved.
330 //
331 // Author: Shane Hudson (sgh@users.sourceforge.net)
332 //
333 //////////////////////////////////////////////////////////////////////
334 
335 // Retrieve the list of Rating figures for given player (aka node) from the given (ssp) string
336 // The string is formatted as:
337 // [%Elo ]<year>:<<rating>|?>,...,<<rating>|?> [<year>:<<rating>|?>,...,<<rating>|?>...]
338 //
339 // The ratings are stored in a rating array for this player, in the order of appearance
340 // and without any assumption on the period that the rating refers to.
341 // This is accomplished by assuming that for all years the same number of rating figures
342 // could be given (see ELO_RATINGS_PER_YEAR above).
343 //
344 // The (external) algorithm to map ratings to actual periods must be able to cope with
345 // the holes that - as a consequence - will appear in the rating graph constructed here!
346 //
347 void PlayerElo::AddEloData(const char * str)
348 {
349  while (1) {
350  // Get the year in which the rating figures to follow were published
351  //
352  str = strTrimLeft (str);
353  if (! isdigit(static_cast<unsigned char>(*str))) { break; }
354  uint16_t year = strGetUnsigned (str);
355  str += 4;
356  if (*str != ':') { break; }
357  str++;
358 
359  // Now read all the ratings for this year:
360  //
361  eloT elo = 0;
362  while (1) {
363  if (isdigit(static_cast<unsigned char>(*str))) {
364  elo = strGetUnsigned (str);
365  str += 4;
366  } else if (*str == '?') {
367  elo = 0;
368  str++;
369  } else if (*str == ' ') {
370  break;
371  } else {
372  // Invalid data seen:
373  return;
374  }
375 
376  elo_.push_back(std::make_pair(year, elo));
377 
378  if (*str == ',') { str++; }
379  }
380  }
381 }
382 
383 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~
384 // PlayerInfo::GetTitle:
385 // Extract the first title appearing in the player
386 // comment, and return it.
387 const char *
389 {
390  static const char * titles[] = {
391  "gm", "im", "fm",
392  "wgm", "wim", "wfm", "w",
393  "cgm", "cim", "hgm",
394  NULL
395  };
396  const char ** titlePtr = titles;
397 
398  const char* comment = GetComment();
399  if (*comment == 0) { return ""; }
400 
401  while (*titlePtr != NULL) {
402  if (strIsPrefix (*titlePtr, comment)) { return *titlePtr; }
403  titlePtr++;
404  }
405  return "";
406 }
407 
408 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~
409 // PlayerInfo::GetLastCountry:
410 // Scan the player comment string for the country field (which
411 // is the second field, after the title), then return the
412 // last three letters in the country field, or the empty string
413 // if the country field is less than 3 characters long.
414 const char *
416 {
417  static char country[4];
418  country[0] = 0;
419 
420  const char* start = GetComment();
421  if (*start == 0) { return ""; }
422 
423  // Skip over the title field:
424  while (*start != ' ' && *start != 0) { start++; }
425  while (*start == ' ') { start++; }
426 
427  const char * end = start;
428  int length = 0;
429  while (*end != ' ' && *end != 0) { end++; length++; }
430  // Return the final three characters of the country field:
431  if (length >= 3) {
432  for (int i=0; i < 3; i++) { country[i] = start[length-3 + i]; }
433  country[3] = 0;
434  }
435  return country;
436 }
437 
438 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~
439 // PlayerInfo::GetPeakRating:
440 // Scan the player comment string for the peak rating
441 // field (which is contained in brackets), convert it
442 // to an unsigned integer, and return it.
443 eloT
445 {
446  const char* s = GetComment();
447  if (*s == 0) { return 0; }
448 
449  while (*s != '[' && *s != 0) { s++; }
450  if (*s != '[') { return 0; }
451  s++;
452  return strGetUnsigned (s);
453 }
454 
455 
456 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~
457 // PlayerInfo::GetBirthdate:
458 // Scan the player comment string for the birthdate
459 // field, convert it to a date, and return it.
460 dateT
462 {
463  const char* s = GetComment();
464  if (*s == 0) { return ZERO_DATE; }
465 
466  // Find the end-bracket character after the rating:
467  while (*s != ']' && *s != 0) { s++; }
468  if (*s != ']') { return ZERO_DATE; }
469  s++;
470  // Now skip over any spaces:
471  while (*s == ' ') { s++; }
472  if (*s == 0) { return ZERO_DATE; }
473  return date_EncodeFromString (s);
474 }
475 
476 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~
477 // PlayerInfo::GetDeathdate:
478 // Scan the player comment string for the deathdate
479 // field, convert it to a date, and return it.
480 dateT
482 {
483  const char* s = GetComment();
484  if (*s == 0) { return ZERO_DATE; }
485 
486  // Find the end-bracket character after the rating:
487  while (*s != ']' && *s != 0) { s++; }
488  if (*s != ']') { return ZERO_DATE; }
489  s++;
490  // Now skip over any spaces:
491  while (*s == ' ') { s++; }
492  // Now skip over the birthdate and dashes:
493  while (*s != 0 && *s != '-') { s++; }
494  while (*s == '-') { s++; }
495  if (*s == 0) { return ZERO_DATE; }
496  return date_EncodeFromString (s);
497 }
498 
499 //////////////////////////////////////////////////////////////////////
500 // EOF: spellchk.cpp
501 //////////////////////////////////////////////////////////////////////
const char * strTrimLeft(const char *target, const char *trimChars)
Definition: misc.cpp:314
const char * getTitle() const
Definition: spellchk.cpp:388
const errorT OK
Definition: error.h:23
bool strIsPrefix(const char *prefix, const char *longStr)
Definition: misc.h:331
playerInfo?player?
Definition: pinfo.tcl:347
uint dateT
Definition: common.h:147
#define ASSERT(f)
Definition: common.h:59
errorT addPrefix(const char *s)
add*fix() - add a general correction
Definition: spellchk.h:111
Definition: misc.h:63
names
Definition: tablebase.tcl:257
SpellChkLoader(SpellChecker &sp, SpellChecker::SpellChkValidate &v)
Definition: spellchk.cpp:140
class SpellChecker - name spelling
Definition: spellchk.h:259
const dateT ZERO_DATE
Definition: date.h:35
errorT addInfix(const char *s)
Definition: spellchk.h:112
const errorT ERROR_FileRead
Definition: error.h:33
const errorT ERROR_UserCancel
Definition: error.h:27
void strTrimRight(char *target, const char *trimChars, size_t nTrimCh)
Definition: misc.h:427
static bool IsValidNameType(nameT nt)
Validate a nameT type.
Definition: namebase.h:220
sort?type?
Definition: analysis.tcl:320
uint32_t uint
Definition: common.h:91
ushort eloT
Definition: common.h:164
eloT getPeakRating() const
Definition: spellchk.cpp:444
const errorT ERROR_CorruptData
Definition: error.h:46
uint32_t strGetUnsigned(const char *str)
Definition: misc.h:195
unsigned short errorT
Definition: error.h:20
bool report(size_t done, size_t total) const
Definition: misc.h:75
dateT getBirthdate() const
Definition: spellchk.cpp:461
Adds some helper functions to std::filebuf:
Definition: filebuf.h:35
class SpellChkLoader - load data into a SpellChecker object
Definition: spellchk.cpp:133
unsigned nameT
Definition: common.h:153
Extends the std:filebuf class with performance improvements.
dateT getDeathdate() const
Definition: spellchk.cpp:481
errorT load(const Parser &data, bool *keepBuffer)
Definition: spellchk.cpp:144
errorT addSuffix(const char *s)
Definition: spellchk.h:113
InfoType
Definition: spellchk.cpp:28
const char * getLastCountry() const
Definition: spellchk.cpp:415
static nameT NameTypeFromString(const char *str)
Match a string to a nameT.
Definition: namebase.h:229
const errorT ERROR_FileOpen
Definition: error.h:31
size_t getline(char *str, size_t count)
Equivalent to std::fstream::getline, but faster (no sentry [27.7.2.1.3]).
Definition: filebuf.h:86
void AddEloData(const char *str)
Definition: spellchk.cpp:347
dateT date_EncodeFromString(const char *str)
Definition: date.h:127