Line data Source code
1 : /*
2 : * Copyright (C) 2017 Fulvio Benini
3 :
4 : * This file is part of Scid (Shane's Chess Information Database).
5 : *
6 : * Scid is free software: you can redistribute it and/or modify
7 : * it under the terms of the GNU General Public License as published by
8 : * the Free Software Foundation.
9 : *
10 : * Scid is distributed in the hope that it will be useful,
11 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : * GNU General Public License for more details.
14 : *
15 : * You should have received a copy of the GNU General Public License
16 : * along with Scid. If not, see <http://www.gnu.org/licenses/>.
17 : *
18 : */
19 :
20 : /** @file
21 : * Implements the CodecSCID4 class, which manages the databases encoded
22 : * in Scid format version 4.
23 : */
24 :
25 : #include "codec_scid4.h"
26 : #include <algorithm>
27 :
28 : namespace {
29 :
/**
 * A NameBase file starts with a header containing:
 * - header_magic (8 bytes): identifies the file format
 * - unused (4 bytes):  obsolete timeStamp
 * - number of NAME_PLAYER names stored in the file (3 bytes)
 * - number of NAME_EVENT names stored in the file (3 bytes)
 * - number of NAME_SITE names stored in the file (3 bytes)
 * - number of NAME_ROUND names stored in the file (3 bytes)
 * - unused (12 bytes): obsolete max frequency
 * Names are stored in alphabetical order using front-coding and each record is
 * composed of:
 * - name_id (2-3 bytes): the idx (idNumberT) stored in the Index (.si4) file
 * - unused (1-3 bytes): obsolete frequency
 * - length (1 byte): the total number of bytes of the name (max 255)
 * - prefix (1 byte): the number of bytes in common with the previous name
 *                    (not present for the first name of each name type)
 * - name (0-255 bytes): the part of the name that differs from the previous
 *                       one.
 *
 * The magic is the 7-character string below followed by its terminating NUL,
 * which together make up the 8 bytes of header_magic.
 * Both the characters and the pointer are const: nothing may retarget it.
 */
const char* const NAMEBASE_MAGIC = "Scid.sn";
49 :
50 : /**
51 : * Read a SCIDv4 NameBase file into memory.
52 : * @param filename: the full path of the file to open.
53 : * @param fMode: a valid file mode.
54 : * @param nb: reference to the object where the names will be stored.
55 : * @returns OK if successful or an error code.
56 : */
57 7 : errorT namefileRead(const char* filename, fileModeT fmode, NameBase& nb) {
58 7 : auto nb_data = nb.getData();
59 7 : auto& map = std::get<0>(nb_data);
60 7 : auto& names = std::get<1>(nb_data);
61 7 : auto& eloV = std::get<2>(nb_data);
62 :
63 14 : Filebuf file;
64 7 : if (file.Open(filename, fmode) != OK)
65 0 : return ERROR_FileOpen;
66 :
67 7 : char Header_magic[9] = {0}; // magic identifier must be "Scid.sn"
68 7 : file.sgetn(Header_magic, 8);
69 7 : if (strcmp(Header_magic, NAMEBASE_MAGIC) != 0)
70 0 : return ERROR_BadMagic;
71 :
72 : // *** Compatibility ***
73 : // Even if timeStamp is not used we still need to read the bytes
74 7 : file.ReadFourBytes();
75 : // ***
76 :
77 : idNumberT Header_numNames[NUM_NAME_TYPES];
78 7 : Header_numNames[NAME_PLAYER] = file.ReadThreeBytes();
79 7 : Header_numNames[NAME_EVENT] = file.ReadThreeBytes();
80 7 : Header_numNames[NAME_SITE] = file.ReadThreeBytes();
81 7 : Header_numNames[NAME_ROUND] = file.ReadThreeBytes();
82 :
83 : // *** Compatibility ***
84 : // Even if frequency is no longer used we still need to read the bytes
85 : uint obsolete_maxFreq[NUM_NAME_TYPES];
86 7 : obsolete_maxFreq[NAME_PLAYER] = file.ReadThreeBytes();
87 7 : obsolete_maxFreq[NAME_EVENT] = file.ReadThreeBytes();
88 7 : obsolete_maxFreq[NAME_SITE] = file.ReadThreeBytes();
89 7 : obsolete_maxFreq[NAME_ROUND] = file.ReadThreeBytes();
90 : // ***
91 :
92 7 : eloV.resize(Header_numNames[NAME_PLAYER], 0);
93 35 : for (nameT nt : {NAME_PLAYER, NAME_EVENT, NAME_SITE, NAME_ROUND}) {
94 28 : names[nt].resize(Header_numNames[nt]);
95 : idNumberT id;
96 56 : std::string prevName;
97 2425 : for (idNumberT i = 0; i < Header_numNames[nt]; i++) {
98 2397 : if (Header_numNames[nt] >= 65536) {
99 0 : id = file.ReadThreeBytes();
100 : } else {
101 2397 : id = file.ReadTwoBytes();
102 : }
103 :
104 : // *** Compatibility ***
105 : // Even if frequency is no longer used we still need to read the
106 : // bytes Frequencies can be stored in 1, 2 or 3 bytes:
107 2397 : if (obsolete_maxFreq[nt] >= 65536) {
108 0 : file.ReadThreeBytes();
109 2397 : } else if (obsolete_maxFreq[nt] >= 256) {
110 13 : file.ReadTwoBytes();
111 : } else { // Frequencies all <= 255: fit in one byte
112 2384 : file.ReadOneByte();
113 : }
114 : // ***
115 :
116 : // Read the name string.
117 : // All strings EXCEPT the first are front-coded.
118 2397 : int length = file.ReadOneByte();
119 2397 : int prefix = (i > 0) ? file.ReadOneByte() : 0;
120 2397 : if (prefix > length)
121 0 : return ERROR_Corrupt;
122 :
123 2397 : char* name = new char[length + 1];
124 2397 : std::copy_n(prevName.c_str(), prefix, name);
125 2397 : std::streamsize extra_chars = length - prefix;
126 2397 : if (extra_chars != file.sgetn(name + prefix, extra_chars)) {
127 0 : delete[] name;
128 0 : return ERROR_FileRead;
129 : }
130 2397 : name[length] = 0;
131 2397 : prevName.assign(name, length);
132 :
133 2397 : if (id < Header_numNames[nt] && names[nt][id] == 0) {
134 2397 : names[nt][id].reset(name);
135 2397 : map[nt].insert(map[nt].end(), std::make_pair(name, id));
136 : } else {
137 0 : delete[] name;
138 0 : return ERROR_Corrupt;
139 : }
140 : }
141 :
142 28 : if (map[nt].size() != names[nt].size())
143 0 : return ERROR_Corrupt;
144 : }
145 :
146 7 : return OK;
147 : }
148 :
149 1 : bool assert_sorted(const char* str1, const char* str2) {
150 : // *** Compatibility ***
151 : // Older code used a custom StrTree class with a peculiar sorting:
152 : // - the first char was interpreted as an unsigned char;
153 : // - the remaining part was compared with the function
154 : // strComapare(),
155 : // which converts the chars to ints, and is not consistent with
156 : // the standard function strcmp().
157 : // The old StrTree class did also have unpredictable behaviors when
158 : // fed with names not sorted according to that criteria, for example
159 : // it could create Namebase objects with duplicate entries.
160 : // ***
161 1 : if (*str1 == *str2)
162 1 : return strCompare(str1, str2) < 0;
163 :
164 0 : return static_cast<uint>(*str1) < static_cast<uint>(*str2);
165 : }
166 :
167 : /**
168 : * Write a SCIDv4 NameBase file.
169 : * @param filename: the full path of the file to open.
170 : * @param nb: reference to the object where the names will be stored.
171 : * @returns OK if successful or an error code.
172 : */
173 : template <typename TCont, typename TFreq>
174 12 : errorT namefileWrite(const char* filename, const TCont& names_ids,
175 : const TFreq& freq) {
176 24 : Filebuf file;
177 12 : if (file.Open(filename, FMODE_WriteOnly) != OK)
178 0 : return ERROR_FileOpen;
179 :
180 12 : file.sputn(NAMEBASE_MAGIC, 8);
181 :
182 : // *** Compatibility ***
183 : // Even if timeStamp is not used we still need to write these bytes
184 12 : file.WriteFourBytes(0);
185 : // ***
186 :
187 12 : ASSERT(1ULL << 24 > names_ids[NAME_PLAYER].size());
188 12 : ASSERT(1ULL << 24 > names_ids[NAME_EVENT].size());
189 12 : ASSERT(1ULL << 24 > names_ids[NAME_SITE].size());
190 12 : ASSERT(1ULL << 24 > names_ids[NAME_ROUND].size());
191 12 : file.WriteThreeBytes((uint32_t)names_ids[NAME_PLAYER].size());
192 12 : file.WriteThreeBytes((uint32_t)names_ids[NAME_EVENT].size());
193 12 : file.WriteThreeBytes((uint32_t)names_ids[NAME_SITE].size());
194 12 : file.WriteThreeBytes((uint32_t)names_ids[NAME_ROUND].size());
195 :
196 : // *** Compatibility ***
197 : // even if maxFrequency is no longer used we still need to write these bytes
198 12 : unsigned maxFreq[NUM_NAME_TYPES] = {0};
199 60 : for (nameT nt : {NAME_PLAYER, NAME_EVENT, NAME_SITE, NAME_ROUND}) {
200 48 : auto it = std::max_element(freq[nt].begin(), freq[nt].end());
201 48 : maxFreq[nt] = (it == freq[nt].end()) ? 0 : *it;
202 48 : file.WriteThreeBytes(maxFreq[nt]);
203 : }
204 : // ***
205 :
206 60 : for (nameT nt : {NAME_PLAYER, NAME_EVENT, NAME_SITE, NAME_ROUND}) {
207 48 : const char* prevName = nullptr;
208 48 : size_t numNames = names_ids[nt].size();
209 65 : for (const auto& it : names_ids[nt]) {
210 17 : const char* name = it.first;
211 17 : idNumberT id = it.second;
212 :
213 17 : ASSERT(prevName == nullptr || assert_sorted(prevName, name));
214 :
215 : // write idNumber in 2 bytes if possible, otherwise 3.
216 17 : if (numNames >= 65536) {
217 0 : file.WriteThreeBytes(id);
218 : } else {
219 17 : file.WriteTwoBytes(id);
220 : }
221 :
222 : // *** Compatibility ***
223 : // write these bytes even if they are not used anymore
224 17 : if (maxFreq[nt] >= 65536) {
225 0 : file.WriteThreeBytes(freq[nt][id]);
226 17 : } else if (maxFreq[nt] >= 256) {
227 17 : file.WriteTwoBytes(freq[nt][id]);
228 : } else {
229 0 : file.WriteOneByte(static_cast<byte>(freq[nt][id]));
230 : }
231 : // ***
232 :
233 17 : ASSERT(strlen(name) < 256);
234 17 : byte length = static_cast<byte>(strlen(name));
235 17 : file.WriteOneByte(length);
236 17 : byte prefix = 0;
237 17 : if (prevName) {
238 1 : prefix = (byte)strPrefix(name, prevName);
239 1 : file.WriteOneByte(prefix);
240 : }
241 17 : file.sputn(name + prefix, (length - prefix));
242 17 : prevName = name;
243 : }
244 : }
245 12 : return OK;
246 : }
247 :
248 : } // namespace
249 :
250 : /**
251 : * Decode SCID4 (or SCID3) data into an IndexEntry object.
252 : * @param buf_it: pointer to the buffer containing the data
253 : * (should contain INDEX_ENTRY_SIZE chars)
254 : * @param version: 400 for SCID4 or 300 for SCID3.
255 : * @param ie: pointer to the IndexEntry object where the data will be
256 : * stored.
257 : */
258 204339 : void decodeIndexEntry(const char* buf_it, versionT version, IndexEntry* ie) {
259 7969221 : auto ReadOneByte = [&buf_it]() {
260 7969221 : uint8_t res = *buf_it++;
261 7969221 : return res;
262 204339 : };
263 6947526 : auto ReadTwoBytes = [&ReadOneByte]() {
264 3473763 : uint16_t high = ReadOneByte();
265 6947526 : uint16_t res = (high << 8) | ReadOneByte();
266 3473763 : return res;
267 204339 : };
268 1226034 : auto ReadFourBytes = [&ReadTwoBytes]() {
269 613017 : uint32_t high = ReadTwoBytes();
270 1226034 : uint32_t res = (high << 16) | ReadTwoBytes();
271 613017 : return res;
272 204339 : };
273 :
274 : // Offset of the gamefile record (32 bits).
275 204339 : ie->SetOffset(ReadFourBytes());
276 :
277 : // Length of gamefile record for this game: 17 bits are used so the max
278 : // length is 128 ko (131071).
279 : // Lower bits of the extra byte are used for custom flags: LxFFFFFF ( L =
280 : // length for long games, x = spare, F = custom flags)
281 204339 : uint32_t len_Low = ReadTwoBytes();
282 204339 : uint32_t len_flags = (version < 400) ? 0 : ReadOneByte();
283 204339 : ie->SetLength(((len_flags & 0x80) << 9) | len_Low);
284 204339 : uint32_t Flags = ReadTwoBytes();
285 204339 : ie->clearFlags();
286 204339 : ie->SetFlag(((len_flags & 0x3F) << 16) | Flags, true);
287 :
288 : // WhiteID and BlackID are 20-bit values, EventID and SiteID are
289 : // 19-bit values, and RoundID is an 18-bit value.
290 : // WhiteID high 4 bits = bits 4-7 of WhiteBlack_High.
291 : // BlackID high 4 bits = bits 0-3 of WhiteBlack_High.
292 : // EventID high 3 bits = bits 5-7 of EventSiteRnd_high.
293 : // SiteID high 3 bits = bits 2-4 of EventSiteRnd_high.
294 : // RoundID high 2 bits = bits 0-1 of EventSiteRnd_high.
295 204339 : uint32_t WhiteBlack_High = ReadOneByte();
296 204339 : uint32_t WhiteID_Low = ReadTwoBytes();
297 204339 : ie->SetWhite(((WhiteBlack_High & 0xF0) << 12) | WhiteID_Low);
298 204339 : uint32_t BlackID_Low = ReadTwoBytes();
299 204339 : ie->SetBlack(((WhiteBlack_High & 0x0F) << 16) | BlackID_Low);
300 204339 : uint32_t EventSiteRnd_High = ReadOneByte();
301 204339 : uint32_t EventID_Low = ReadTwoBytes();
302 204339 : ie->SetEvent(((EventSiteRnd_High & 0xE0) << 11) | EventID_Low);
303 204339 : uint32_t SiteID_Low = ReadTwoBytes();
304 204339 : ie->SetSite(((EventSiteRnd_High & 0x1C) << 14) | SiteID_Low);
305 204339 : uint32_t RoundID_Low = ReadTwoBytes();
306 204339 : ie->SetRound(((EventSiteRnd_High & 0x03) << 16) | RoundID_Low);
307 :
308 : // Counters for comments, variations, etc. (4 bits each)
309 : // VarCounts also stores the result (4 bits).
310 204339 : uint32_t varCounts = ReadTwoBytes();
311 204339 : ie->SetRawVariationCount(varCounts & 0x0F);
312 204339 : ie->SetRawCommentCount((varCounts >> 4) & 0x0F);
313 204339 : ie->SetRawNagCount((varCounts >> 8) & 0x0F);
314 204339 : ie->SetResult((varCounts >> 12) & 0x0F);
315 :
316 : // ECO code (16 bits)
317 204339 : ie->SetEcoCode(ReadTwoBytes());
318 :
319 : // Date and EventDate are stored in four bytes.
320 : // Due to a compact encoding format, the EventDate
321 : // must be within a few years of the Date.
322 204339 : uint32_t date_edate = ReadFourBytes();
323 204339 : uint32_t date = date_edate & 0xFFFFF;
324 204339 : ie->SetDate(date);
325 204339 : uint32_t edate = date_edate >> 20;
326 204339 : uint32_t eyear = date_GetYear(edate) & 0x07;
327 204339 : if (eyear == 0) {
328 200298 : edate = ZERO_DATE;
329 : } else {
330 4041 : eyear += date_GetYear(date);
331 4041 : eyear = (eyear < 4) ? 0 : eyear - 4;
332 4041 : edate = DATE_MAKE(eyear, date_GetMonth(edate), date_GetDay(edate));
333 : }
334 204339 : ie->SetEventDate(edate);
335 :
336 : // The two ELO ratings and rating types take 2 bytes each.
337 204339 : uint16_t whiteElo = ReadTwoBytes();
338 204339 : ie->SetWhiteElo(whiteElo & 0xFFF);
339 204339 : ie->SetWhiteRatingType(whiteElo >> 12);
340 204339 : uint16_t blackElo = ReadTwoBytes();
341 204339 : ie->SetBlackElo(blackElo & 0xFFF);
342 204339 : ie->SetBlackRatingType(blackElo >> 12);
343 :
344 : // material of the final position in the game,
345 : // and the StoredLineCode in the top 8 bits.
346 204339 : uint32_t finalMatSig = ReadFourBytes();
347 204339 : ie->SetFinalMatSig(finalMatSig & 0xFFFFFF);
348 204339 : ie->SetStoredLineCode(finalMatSig >> 24);
349 :
350 : // Read the 9-byte homePawnData array:
351 : // The first byte of HomePawnData has high bits of the NumHalfMoves
352 : // counter in its top two bits:
353 204339 : uint16_t NumHalfMoves = ReadOneByte();
354 204339 : uint16_t pawnData0 = ReadOneByte();
355 204339 : ie->SetNumHalfMoves(((pawnData0 & 0xC0) << 2) | NumHalfMoves);
356 204339 : byte* pb = ie->GetHomePawnData();
357 204339 : *pb++ = pawnData0 & 0x3F;
358 204339 : std::copy_n(buf_it, HPSIG_SIZE - 1, pb);
359 204339 : }
360 :
361 16 : errorT CodecSCID4::dyn_open(fileModeT fMode, const char* filename,
362 : const Progress& progress, Index* idx,
363 : NameBase* nb) {
364 16 : if (filename == nullptr || idx == nullptr || nb == nullptr)
365 0 : return ERROR;
366 16 : if (*filename == '\0')
367 1 : return ERROR_FileOpen;
368 :
369 15 : idx_ = idx;
370 15 : nb_ = nb;
371 15 : filenames_.resize(3);
372 15 : filenames_[0] = std::string(filename) + ".si4";
373 15 : filenames_[1] = std::string(filename) + ".sn4";
374 15 : filenames_[2] = std::string(filename) + ".sg4";
375 :
376 15 : errorT err = gfile_.open(filenames_[2], fMode);
377 15 : if (err != OK)
378 1 : return err;
379 :
380 14 : if (fMode == FMODE_Create) {
381 6 : err = idx->Create(filename);
382 6 : if (err == OK) {
383 6 : err = namefileWrite(filenames_[1].c_str(), nb_->getNames(),
384 12 : idx_->calcNameFreq(*nb_));
385 : }
386 : } else {
387 8 : err = idx->Open(filename, fMode);
388 8 : if (err == OK)
389 7 : err = namefileRead(filenames_[1].c_str(), fMode, *nb_);
390 8 : if (err == OK)
391 7 : err = readIndex(progress);
392 : }
393 :
394 14 : return err;
395 : }
396 :
397 6 : errorT CodecSCID4::flush() {
398 6 : errorT err = idx_->flush();
399 6 : if (err == OK) {
400 : // *** Compatibility ***
401 : // Even if name's frequency is no longer used, it's necessary to
402 : // keep the compatibility with older Scid versions, forcing a
403 : // recalculation.
404 6 : err = namefileWrite(filenames_[1].c_str(), nb_->getNames(),
405 12 : idx_->calcNameFreq(*nb_));
406 : }
407 6 : errorT errGfile = (gfile_.pubsync() == 0) ? OK : ERROR_FileWrite;
408 :
409 6 : return (err == OK) ? errGfile : err;
410 : }
411 :
412 : /**
413 : * Reads the entire index file into memory.
414 : * Invalid name IDs are replaced with "?" if possible.
415 : * @param progress: a Progress object used for GUI communications.
416 : * @returns OK if successful or an error code.
417 : */
418 7 : inline errorT CodecSCID4::readIndex(const Progress& progress) {
419 7 : gamenumT nUnknowIDs = 0;
420 : idNumberT maxID[NUM_NAME_TYPES];
421 35 : for (nameT nt = NAME_PLAYER; nt < NUM_NAME_TYPES; nt++) {
422 28 : maxID[nt] = nb_->GetNumNames(nt);
423 : }
424 4339 : auto validateNameIDs = [&](IndexEntry* ie) {
425 21695 : if (ie->GetWhite() >= maxID[NAME_PLAYER]) {
426 0 : auto unknown = dyn_addName(NAME_PLAYER, "?");
427 0 : if (unknown.first != OK)
428 0 : return false;
429 0 : ie->SetWhite(unknown.second);
430 0 : ++nUnknowIDs;
431 : }
432 8678 : if (ie->GetBlack() >= maxID[NAME_PLAYER]) {
433 0 : auto unknown = dyn_addName(NAME_PLAYER, "?");
434 0 : if (unknown.first != OK)
435 0 : return false;
436 0 : ie->SetBlack(unknown.second);
437 0 : ++nUnknowIDs;
438 : }
439 8678 : if (ie->GetEvent() >= maxID[NAME_EVENT]) {
440 0 : auto unknown = dyn_addName(NAME_EVENT, "?");
441 0 : if (unknown.first != OK)
442 0 : return false;
443 0 : ie->SetEvent(unknown.second);
444 0 : ++nUnknowIDs;
445 : }
446 8678 : if (ie->GetSite() >= maxID[NAME_SITE]) {
447 0 : auto unknown = dyn_addName(NAME_SITE, "?");
448 0 : if (unknown.first != OK)
449 0 : return false;
450 0 : ie->SetSite(unknown.second);
451 0 : ++nUnknowIDs;
452 : }
453 8678 : if (ie->GetRound() >= maxID[NAME_ROUND]) {
454 0 : auto unknown = dyn_addName(NAME_ROUND, "?");
455 0 : if (unknown.first != OK)
456 0 : return false;
457 0 : ie->SetRound(unknown.second);
458 0 : ++nUnknowIDs;
459 : }
460 4339 : return true;
461 7 : };
462 :
463 7 : auto idxFile = idx_->FilePtr;
464 7 : auto version = idx_->Header.version;
465 7 : auto nGames = idx_->GetNumGames();
466 7 : idx_->entries_.resize(nGames);
467 :
468 7 : auto nBytes = (version < 400) ? OLD_INDEX_ENTRY_SIZE : INDEX_ENTRY_SIZE;
469 4346 : for (gamenumT gNum = 0; idxFile->sgetc() != EOF; ++gNum) {
470 4339 : if (gNum == nGames)
471 0 : return ERROR_CorruptData;
472 :
473 4339 : if ((gNum % 8192) == 0) {
474 4 : if (!progress.report(gNum, nGames))
475 0 : return ERROR_UserCancel;
476 : }
477 :
478 : char buf[INDEX_ENTRY_SIZE];
479 4339 : if (idxFile->sgetn(buf, nBytes) != nBytes)
480 0 : return ERROR_FileRead;
481 :
482 4339 : IndexEntry* ie = idx_->FetchEntry(gNum);
483 4339 : decodeIndexEntry(buf, version, ie);
484 :
485 4339 : if (!validateNameIDs(ie))
486 0 : return ERROR_CorruptData;
487 :
488 4339 : nb_->AddElo(ie->GetWhite(), ie->GetWhiteElo());
489 4339 : nb_->AddElo(ie->GetBlack(), ie->GetBlackElo());
490 : }
491 7 : progress.report(1, 1);
492 :
493 7 : if (nGames != idx_->GetNumGames())
494 0 : return ERROR_FileRead;
495 :
496 7 : idx_->nInvalidNameId_ = nUnknowIDs;
497 7 : return (nUnknowIDs == 0) ? OK : ERROR_NameDataLoss;
498 : }
|