// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2023 SASANO Takayoshi

// N-gram counter.
//
// Reads space-delimited lines from stdin, wraps every line in sentence
// markers, counts every run of N consecutive words (option -n) and prints
// "ngram<TAB>count" for every N-gram whose count is at least the limit
// (option -l, default 0).
//
// NOTE(review): this file reached review with every "<...>" token stripped
// (header names, template parameter lists, marker literals).  The template
// arguments and the sentence markers "<s> " / "</s> " are reconstructed from
// the surrounding arithmetic (buf + 4, CONV_BUFSIZE - 10 == 4 + 5 + 1).  The
// header list and the unknown-word marker "<unk>" are best guesses -- confirm
// them against the repository before merging.

#include <inttypes.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <wchar.h>

#include <map>
#include <string>

// One counted item: the N-gram text and its accumulated value.
template <typename T> struct data_entry {
	std::wstring key;
	T value;
};

// Ordered store of data_entry records, keyed by the entry text.
template <typename T> class data_store {
public:
	std::map<std::wstring, data_entry<T>> db;

	// Insert `key` with `value`, or add `value` to the entry that is
	// already stored under `key`.
	void set(const wchar_t *key, T value) {
		std::wstring k(key);
		auto [pos, inserted] = db.try_emplace(k, data_entry<T>{k, value});

		// reuse the iterator instead of looking the key up a second time
		if (!inserted)
			pos->second.value += value;
	}

	// Drop every stored entry.
	void init(void) { db.clear(); }
};

// capacity of the line buffer, in wide characters
#define CONV_BUFSIZE	65536

static data_store<int64_t> d;

// Return a pointer to the num-th L' ' in `str`, or nullptr when `str`
// holds fewer than `num` spaces (always nullptr for num <= 0).
static wchar_t *find_nth_delimiter(wchar_t *str, int num)
{
	int n = 0;

	for (; *str; str++) {
		if (*str == L' ' && ++n == num)
			return str;
	}

	return nullptr;
}

// Count every N-gram of `n` words in `buf`.  Each word, including the last
// one, must be followed by L' ', because an N-gram is only recognised up to
// its n-th following space.
static void do_line(wchar_t *buf, int n)
{
	wchar_t *p = buf;

	while (1) {
		/* no more N-gram */
		wchar_t *q = find_nth_delimiter(p, n);
		if (q == nullptr)
			return;

		/* extract N-gram (std::wstring: no fixed-size scratch buffer) */
		std::wstring ngram(p, q);

		/* add candidate if it does not contain "unknown word" */
		/* NOTE(review): marker text reconstructed -- confirm "<unk>" */
		if (ngram.find(L"<unk>") == std::wstring::npos)
			d.set(ngram.c_str(), 1);

		/* no more next word */
		if ((p = wcschr(p, L' ')) == nullptr)
			return;
		p++;
	}
}

int main(int argc, char *argv[])
{
	FILE *fpi = stdin, *fpo = stdout;
	int64_t limit = 0;
	int ch, ngram = 0;
	int status = EXIT_SUCCESS;
	bool usage = false;
	static wchar_t buf[CONV_BUFSIZE];	/* static: keep it off the stack */

	while ((ch = getopt(argc, argv, "l:n:")) != -1) {
		switch (ch) {
		case 'l':
			limit = atoll(optarg);
			break;
		case 'n':
			ngram = atoi(optarg);
			break;
		default:
			/* unknown option or missing argument */
			usage = true;
			break;
		}
	}

	if (usage || ngram <= 0) {
		fprintf(stderr, "usage: %s -n [ngram] -l [limit]\n", argv[0]);
		return EXIT_FAILURE;
	}

	/* fgetws() and "%ls" convert using the current locale */
	if (setlocale(LC_ALL, "ja_JP.UTF-8") == nullptr)
		fprintf(stderr, "%s: warning: locale ja_JP.UTF-8 is not "
			"available\n", argv[0]);

	/* pass 1 */
	d.init();
	wcscpy(buf, L"<s> ");
	while (fgetws(buf + 4, CONV_BUFSIZE - 10, fpi) != nullptr) {
		// input line always terminates 0x20, 0x0a
		// (see convert_yomi.cpp) -> keep 0x20
		// 0x20 after </s> is needed for find_nth_delimiter()
		size_t len = wcslen(buf);

		// replace the newline only when there is one, so that a final
		// line without 0x0a does not lose its last character
		if (buf[len - 1] == L'\n')
			len--;
		wcscpy(buf + len, L"</s> ");

		do_line(buf, ngram);
	}

	if (ferror(fpi)) {
		/* read or decoding error: counts below are incomplete */
		perror(argv[0]);
		status = EXIT_FAILURE;
	}

	/* pass 2 */
	for (const auto &item : d.db) {
		const data_entry<int64_t> &e = item.second;

		if (e.value < limit)
			continue;
		fprintf(fpo, "%ls\t%" PRId64 "\n", e.key.c_str(), e.value);
	}

	return status;
}