Scid  4.6.5
spellchk.h
Go to the documentation of this file.
1 /*
2 # Copyright (C) 2015 Fulvio Benini
3 
4 * This file is part of Scid (Shane's Chess Information Database).
5 *
6 * Scid is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation.
9 *
10 * Scid is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with Scid. If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #ifndef SPELLCHK_H
20 #define SPELLCHK_H
21 
22 #include "namebase.h"
23 #include "date.h"
24 #include <string>
25 #include <vector>
26 #include <utility>
27 #include <algorithm>
28 #ifdef SPELLCHKVALIDATE
29 #include <fstream>
30 #endif
31 
32 /*
33 * A "spelling" file contains the correct names for players, events, sites and rounds.
34 * Optionally it can provide further information about players, such as Elo, birthdate, etc.
35 * See the header of spelling.ssp for a more detailed description of the format.
36 */
37 
38 
39 /**
40  * class NameNormalizer - apply general corrections to a name
41  *
42  * Spelling files can provide general corrections in the form:
43  * %Prefix "wrong prefix" "correct prefix"
44  * %Infix "wrong infix" "correct infix"
45  * %Suffix "wrong suffix" "correct suffix"
46  *
47  * Example:
48  * %Prefix "II " "2. "
49  * %Infix "3rd " "3. "
50  * %Suffix "(Italy)" "ITA"
51  * "II champ 3rd II 3rd (Italy) (Italy)" --> "2. champ 3. II 3. (Italy) ITA"
52  */
54  typedef std::vector< std::pair<std::string,std::string> > Cont;
55  Cont prefix_;
56  Cont infix_ ;
57  Cont suffix_;
58 
59 public:
60  /**
61  * normalize() - correct a name
62  * @name: the name to be corrected
63  *
64  * Return: count of corrections applied
65  */
66  size_t normalize(std::string* name) const {
67  size_t corrections = 0;
68  Cont::const_iterator it;
69 
70  for (it = prefix_.begin(); it != prefix_.end(); it++) {
71  const std::string& s = it->first;
72  if (name->compare(0, s.length(), s) == 0) {
73  corrections++;
74  name->replace(0, s.length(), it->second);
75  break;
76  }
77  }
78 
79  for (it = infix_.begin(); it != infix_.end(); it++) {
80  const std::string& s = it->first;
81  size_t pos = name->find(s);
82  while (pos != std::string::npos) {
83  corrections++;
84  name->replace(pos, s.length(), it->second);
85  pos = name->find(s, pos + it->second.length());
86  }
87  }
88 
89  for (it = suffix_.begin(); it != suffix_.end(); it++) {
90  const std::string& s = it->first;
91  if (name->length() < s.length()) continue;
92  size_t pos = name->length() - s.length();
93  if (name->compare(pos, s.length(), s) == 0) {
94  corrections++;
95  name->replace(pos, s.length(), it->second);
96  break;
97  }
98  }
99 
100  return corrections;
101  }
102 
103  /**
104  * add*fix() - add a general correction
105  *
106  * Adds a general prefix, infix or suffix correction.
107  * Syntax for @s is:
108  * %Suffix "wrong suffix" "correct suffix"
109  * Return: OK if successful
110  */
111  errorT addPrefix(const char* s) { return add(prefix_, s); }
112  errorT addInfix (const char* s) { return add(infix_, s); }
113  errorT addSuffix(const char* s) { return add(suffix_,s); }
114 
115 private:
116  errorT add(Cont& v, const char* s) {
117  ASSERT(s != 0);
118  std::vector<size_t> parse;
119  for (size_t i=0; *(s+i) != 0; i++) {
120  if (*(s+i) == '"') parse.push_back(i);
121  }
122  if (parse.size() != 4) return ERROR_CorruptData;
123  parse[0] += 1; //skip "
124  parse[1] -= parse[0]; //n_chars
125  if (parse[1] == 0) return ERROR_CorruptData;
126  parse[2] += 1; //skip "
127  parse[3] -= parse[2]; //n_chars
128  v.push_back(std::make_pair(
129  std::string(s + parse[0], parse[1]),
130  std::string(s + parse[2], parse[3])
131  ));
132  return OK;
133  }
134 };
135 
136 
137 /**
138  * class PlayerElo - elo ratings of a player
139  *
140  * Spelling files can provide elo ratings of a player in the form:
141  * %Elo YEAR:ELO_1PERIOD,ELO_2PERIOD,ELO_3PERIOD,... YEAR:ELO_1PERIOD,...
142  */
143 class PlayerElo {
144  std::vector< std::pair<uint16_t, eloT> > elo_;
145 
146 public:
147  void AddEloData(const char* str);
148 
149  eloT getElo (dateT date) const {
150  uint year = date_GetYear (date);
151  std::vector< std::pair<uint16_t, eloT> >::const_iterator itBegin = elo_.begin();
152  while (itBegin != elo_.end() && itBegin->first != year) itBegin++;
153  std::vector< std::pair<uint16_t, eloT> >::const_iterator itEnd = itBegin;
154  while (itEnd != elo_.end() && itEnd->first == year) itEnd++;
155 
156 
157  size_t n = std::distance(itBegin, itEnd);
158  if (n == 0) return 0; // No data for that year
159 
160  uint month = date_GetMonth (date);
161  if (month == 0 || month > 12) month = 0;
162  else month -= 1;
163 
164  size_t idx;
165  if (year == 2009 && n == 5) {
166  //2 trimonthly + 3 bimonthly
167  idx = (month < 6) ? month / 3 : (month - 2)/2;
168 
169  } else if (year == 2012 && n == 9) {
170  //3 bimonthly + 6 monthly
171  idx = (month < 6) ? month / 2 : month - 3;
172 
173  } else if (year > 2012) {
174  // monthly
175  if (month >= n) return 0;
176  idx = month;
177 
178  } else {
179  idx = month * n / 12;
180  }
181 
182  return (itBegin + idx)->second;
183  }
184 
185 #ifdef SPELLCHKVALIDATE
186  std::string isValid() const {
187  for (size_t i=1, n=elo_.size(); i < n; i++) {
188  if (elo_[i].first < elo_[i -1].first) return "unsorted";
189  }
190 
191  #if CPP11_SUPPORT
192  auto count = [this](uint year) {
193  return std::count_if(this->elo_.begin(), this->elo_.end(),
194  [&](const std::pair<uint16_t, eloT>& e) { return e.first == year; });
195  };
196 
197  auto expected = [](uint year) {
198  if (year < 1990) return 1;
199  if (year < 2001) return 2;
200  if (year < 2009) return 4;
201  if (year < 2010) return 5;
202  if (year < 2012) return 6;
203  if (year < 2013) return 9;
204  return 12;
205  };
206 
207  for (uint y=1970; y<2015; y++) {
208  auto n = count(y);
209  if (n == 0) continue;
210  if (n != expected(y))
211  return to_string(y) + ": " + to_string(n) + "(" + to_string(expected(y)) + ")";
212  }
213  #endif
214 
215  return std::string();
216  }
217 #endif
218 };
219 
220 
221 /**
222  * class PlayerInfo - player information
223  *
224  * Spelling files can provide player information such as titles/gender,
225  * countries, highest elo, date of birth, date of death. For example:
226  * Polgar, Judit #gm+w HUN [2735] 1976
227  *
228  * Generic information can be provided in the form:
229  * %Bio This is a generic information
230  */
231 class PlayerInfo {
232  const char* comment_;
233  std::vector<const char*> bio_;
234 
235  friend class SpellChkLoader;
236  friend class SpellChecker;
237 
238 public:
239  PlayerInfo(const char* s) : comment_(s) {}
240  const char* getTitle() const;
241  const char* getLastCountry() const;
242  dateT getBirthdate() const;
243  dateT getDeathdate() const;
244  eloT getPeakRating() const;
245  const char* GetComment() const {
246  return (comment_ != 0) ? comment_ : "";
247  }
248 };
249 
250 
251 /**
252  * class SpellChecker - name spelling
253  *
254  * Reads a spelling file and allows retrieving corrected names and player data.
255  * If SPELLCHKVALIDATE is defined, the spelling file is also checked for errors.
256  */
// One entry of the sorted lookup index: maps an alias (the key that
// searches compare against) to the position of the corresponding entry
// in the per-type tables of correct names.
struct Idx {
    std::string alias; // lookup key
    int32_t idx;       // index of the corresponding correct name

    // Always initialise idx: a default-constructed entry previously held an
    // indeterminate value, and reading it would be undefined behaviour.
    Idx() : idx(0) {}
    Idx(const std::string& a, int32_t i) : alias(a), idx(i) {}

    // Ordering considers the alias only; the second overload lets the
    // standard binary-search algorithms compare an entry with a plain string.
    bool operator<(const Idx& b) const { return alias < b.alias; }
    bool operator<(const std::string& b) const { return alias < b; }
};
267  typedef std::vector<Idx>::const_iterator IdxIt;
268 
269  NameNormalizer general_[NUM_NAME_TYPES];
270  std::string excludeChars_[NUM_NAME_TYPES];
271  std::vector<Idx> idx_[NUM_NAME_TYPES];
272  std::vector<const char*> names_[NUM_NAME_TYPES];
273  std::vector<PlayerInfo> pInfo_;
274  std::vector<PlayerElo> pElo_;
275  char* staticStrings_;
276 
277  friend class SpellChkLoader;
278 
279 public:
281  free(staticStrings_);
282  }
283 
284  /**
285  * Create() - Create a new SpellChecker object
286  *
287  * Create a new SpellChecker reading from @filename.
288  * It's the caller's responsibility to free the object with "delete".
289  * Return:
290  * - OK and a pointer to the new object
291  * - on error the ERROR_*CODE* and NULL
292  */
293  static std::pair<errorT, SpellChecker*> Create(const char* filename,
294  const Progress& progress) {
295  SpellChecker* res = new SpellChecker;
296  errorT err = res->read(filename, progress);
297  if (err != OK) {
298  delete res;
299  res = NULL;
300  }
301  return std::make_pair(err, res);
302  }
303 
304  /**
305  * find() - search for correct names
306  * @nt: the type of the name to be corrected
307  * @name: the name to be corrected
308  * @nMaxRes: max size of the returned vector
309  *
310  * Return: a vector of correct names.
311  * @name will be normalized removing excludeChars_[@nt].
312  * If an exact match for normalized @name is found the result vector will
313  * contain only the corresponding correct name, otherwise will contain all
314  * the correct names that have @name as a prefix.
315  */
316  std::vector<const char*> find(const nameT& nt, const char* name, uint nMaxRes = 10) const {
317  ASSERT(nt < NUM_NAME_TYPES);
318  ASSERT(name != 0);
319  std::vector<const char*> res;
320  std::pair<IdxIt, IdxIt> it;
321  if (nt != NAME_PLAYER) it = idxFind(nt, name);
322  else it = idxFindPlayer(name);
323  for (; it.first != it.second && res.size() < nMaxRes; it.first++) {
324  const char* corrected = names_[nt][it.first->idx];
325  if (std::find(res.begin(), res.end(), corrected) == res.end()) {
326  res.push_back(corrected);
327  }
328  }
329  return res;
330  }
331 
332  const NameNormalizer& getGeneralCorrections(const nameT& nt) const {
333  ASSERT(nt < NUM_NAME_TYPES);
334  return general_[nt];
335  }
336 
337  /**
338  * SpellChecker::getPlayerInfo() - get extra info about a player
339  *
340  * Get extra data like titles/gender, countries, highest elo,
341  * date of birth, date of death or biographic information.
342  * Return:
343  * - on success a pointer to a valid PlayerInfo object containing
344  * the available data. If @bio != 0 the vector is filled with
345  * the available biographic information.
346  * - if @name is not found or is ambiguous (matches multiple players)
347  * returns NULL and @bio is untouched.
348  */
349  const PlayerInfo* getPlayerInfo(const char* name,
350  std::vector<const char*>* bio = 0) const {
351  ASSERT(name != 0);
352  IdxIt it = idxFindPlayerUnambiguous(name);
353  if (it == idx_[NAME_PLAYER].end()) return 0; // not found
354 
355  if (bio != 0) *bio = pInfo_[it->idx].bio_;
356  return &(pInfo_[it->idx]);
357  }
358 
359  const PlayerElo* getPlayerElo(const char* name) const {
360  ASSERT(name != 0);
361  if (!hasEloData()) return 0;
362  IdxIt it = idxFindPlayerUnambiguous(name);
363  if (it == idx_[NAME_PLAYER].end()) return 0; // not found
364  return &(pElo_[it->idx]);
365  }
366 
367  bool hasEloData () const {
368  return pElo_.size() != 0;
369  }
370 
371  size_t numCorrectNames(const nameT& nt) const {
372  ASSERT(nt < NUM_NAME_TYPES);
373  return names_[nt].size();
374  }
375 
376 private:
377  SpellChecker() : staticStrings_(NULL) {}
378  SpellChecker(const SpellChecker&);
379  SpellChecker& operator=(const SpellChecker&);
380 
381  errorT read(const char* filename, const Progress& progress);
382 
383  std::string normalizeAndTransform(const nameT& nt, const char* s) const {
384  std::string res;
385  for (const char* i = s; *i != 0; i++) {
386  if (excludeChars_[nt].find(*i) != std::string::npos) continue;
387 
388  res += *i;
389  }
390  return res;
391  }
392 
393  std::pair<IdxIt, IdxIt> idxFind(const nameT& nt, const char* prefix) const {
394  std::pair<IdxIt, IdxIt> res;
395  std::string s = normalizeAndTransform(nt, prefix);
396  res.first = std::lower_bound(idx_[nt].begin(), idx_[nt].end(), s);
397  for (res.second = res.first; res.second != idx_[nt].end(); res.second++) {
398  if (res.second->alias.compare(0, s.length(), s) != 0) break;
399  if (res.second->alias == s) return std::make_pair(res.second, res.second +1);
400  }
401  return res;
402  }
403 
404  std::pair<IdxIt, IdxIt> idxFindPlayer(const char* prefix) const {
405  std::pair<IdxIt, IdxIt> res = idxFind(NAME_PLAYER, prefix);
406  if (res.first == res.second) {
407  // For spelling of player names (not other types), Scid will also try
408  // to move the text after the last space in the name to the start of
409  // the name for correction purposes, when it cannot find a correction.
410  // This is done to correct names where the surname is last.
411  std::string s = prefix;
412  size_t pos = s.rfind(' ');
413  if (pos != std::string::npos) {
414  std::string inv = s.substr(pos);
415  inv.append(s, 0, pos);
416  return idxFind(NAME_PLAYER, inv.c_str());
417  }
418  }
419  return res;
420  }
421 
422  IdxIt idxFindPlayerUnambiguous(const char* name) const {
423  std::pair<IdxIt, IdxIt> it = idxFindPlayer(name);
424  if (it.first == it.second) return idx_[NAME_PLAYER].end();
425 
426  for (IdxIt i = it.first; i != it.second; i++) {
427  if (i->idx != it.first->idx) //ambiguous
428  return idx_[NAME_PLAYER].end();
429  }
430  return it.first;
431  }
432 
433 
434 #ifndef SPELLCHKVALIDATE
435  class SpellChkValidate {
436  public:
437  SpellChkValidate(const char*, const SpellChecker&) {}
438  void ignoredLine(const char*) {}
439  void idxDuplicates(const nameT&) {}
440  void checkEloData() {}
441  };
442 #else
443  class SpellChkValidate {
444  const SpellChecker& spell_;
445  std::ofstream f_;
446 
447  public:
448  SpellChkValidate(const char* spellfile, const SpellChecker& sp) : spell_(sp) {
449  f_.open(spellfile + std::string(".validate"));
450  }
451  void ignoredLine(const char* line) {
452  f_ << "Ignored line:" << std::endl;
453  f_ << line << std::endl;
454  f_ << std::endl;
455  }
456  static bool cmpIdxAlias(const Idx& a, const Idx& b) {
457  return a.alias == b.alias;
458  }
459  void idxDuplicates(const nameT& nt) {
460  IdxIt it = spell_.idx_[nt].begin();
461  IdxIt it_end = spell_.idx_[nt].end();
462  for (;;) {
463  it = std::adjacent_find(it, it_end, cmpIdxAlias);
464  if (it == it_end) return;
465 
466  IdxIt it_endDuplicates = std::upper_bound(it, it_end, *it);
467  f_ << "Duplicate hash: " << it->alias << std::endl;
468  for(; it != it_endDuplicates; it++) {
469  f_ << spell_.names_[nt][it->idx];
470  f_ << " - Idx:" << it->idx << std::endl;
471  }
472  f_ << std::endl;
473  }
474  }
475  void checkEloData() {
476  for (size_t i=0, n = spell_.pElo_.size(); i < n; i++) {
477  std::string s = spell_.pElo_[i].isValid();
478  if (! s.empty()) {
479  f_ << "Elo error: " << s << " --- ";
480  f_ << spell_.names_[NAME_PLAYER][i] << std::endl;
481  }
482  }
483  }
484  };
485 #endif
486 
487 };
488 
489 
490 #endif
uint date_GetYear(dateT date)
Definition: date.h:55
size_t numCorrectNames(const nameT &nt) const
Definition: spellchk.h:371
const errorT OK
Definition: error.h:23
std::vector< const char * > find(const nameT &nt, const char *name, uint nMaxRes=10) const
Definition: spellchk.h:316
uint dateT
Definition: common.h:155
#define ASSERT(f)
Definition: common.h:67
errorT addPrefix(const char *s)
add*fix() - add a general correction
Definition: spellchk.h:111
class PlayerElo - elo ratings of a player
Definition: spellchk.h:143
Definition: misc.h:124
names
Definition: tablebase.tcl:260
std::string to_string(int val)
Definition: misc.h:153
const char * GetComment() const
Definition: spellchk.h:245
class SpellChecker - name spelling
Definition: spellchk.h:257
size_t normalize(std::string *name) const
Definition: spellchk.h:66
const NameNormalizer & getGeneralCorrections(const nameT &nt) const
Definition: spellchk.h:332
int find(const char *filename)
find() - search for a database.
Definition: dbasepool.cpp:51
bool hasEloData() const
Definition: spellchk.h:367
errorT addInfix(const char *s)
Definition: spellchk.h:112
const PlayerElo * getPlayerElo(const char *name) const
Definition: spellchk.h:359
uint nameT
Definition: namebase.h:29
const PlayerInfo * getPlayerInfo(const char *name, std::vector< const char * > *bio=0) const
Definition: spellchk.h:349
uint32_t uint
Definition: common.h:99
ushort eloT
Definition: common.h:160
const errorT ERROR_CorruptData
Definition: error.h:46
unsigned short errorT
Definition: error.h:20
class NameNormalizer - apply general corrections to a name
Definition: spellchk.h:53
eloT getElo(dateT date) const
Definition: spellchk.h:149
class SpellChkLoader - load data into a SpellChecker object
Definition: spellchk.cpp:133
PlayerInfo(const char *s)
Definition: spellchk.h:239
errorT addSuffix(const char *s)
Definition: spellchk.h:113
uint date_GetMonth(dateT date)
Definition: date.h:64
static std::pair< errorT, SpellChecker * > Create(const char *filename, const Progress &progress)
Create() - Create a new SpellChecker object.
Definition: spellchk.h:293
class PlayerInfo - player informations
Definition: spellchk.h:231