| 1 | #include <unistd.h> |
| 2 | #include <stdio.h> |
| 3 | |
/*
  Desc: A program to print the frequency of different UTF-8 characters,
    taking into account the presence of diacritical marks.
  Depends: glibc >= 2.35 (needed only where the printf "%b" binary
    conversion is used)
  Info: Diacritical marks are found in these Unicode blocks:
    - U+0300-036F: Combining Diacritical Marks
    - U+1AB0-1AFF: Combining Diacritical Marks Extended
    - U+1DC0-1DFF: Combining Diacritical Marks Supplement
    - U+20D0-20FF: Combining Diacritical Marks for Symbols
    - U+FE20-FE2F: Combining Half Marks
  Ref/Attrib: UTF-8 byte mechanics: https://www.johndcook.com/blog/2019/09/09/how-utf-8-works/
*/
| 16 | |
/* Scratch-buffer size for format_binary(): room for 64 binary digits + NUL. */
enum { BINARY_BUF_SIZE = 65 };

/*
 * Render `value` as binary digits without leading zeros ("0" for zero).
 *
 * Digits are written right-to-left at the tail of `buf`, which must hold at
 * least BINARY_BUF_SIZE chars.  Returns a pointer to the first digit inside
 * `buf`; the resulting string is NUL-terminated.
 */
static const char *format_binary(unsigned int value, char *buf)
{
    char *pos = buf + BINARY_BUF_SIZE - 1;

    *pos = '\0';
    do {
        *--pos = (char)('0' + (value & 1u));
        value >>= 1;
    } while (value != 0 && pos > buf);  /* never walk off the buffer front */

    return pos;
}

/*
 * Dump every byte read from standard input as one CSV row holding its
 * decimal, hexadecimal and binary value, then print how many bytes were read.
 *
 * Returns 0 on success, 1 if reading standard input failed.
 */
int main(void)
{
    char bits[BINARY_BUF_SIZE];
    long nc = 0;  /* bytes read; a multibyte UTF-8 character adds one per byte */
    int c;

    printf("%5s,%5s,%9s\n", "dec", "hex", "bin");

    /* getchar() blocks until input is available, so no start-up delay is
       required before the first read. */
    while ((c = getchar()) != EOF) {
        /* The binary column is formatted by hand rather than with the
           C23 / glibc 2.35 "%b" conversion, so the output is the same on
           older C libraries.  c is in [0, UCHAR_MAX] here, so the casts
           to unsigned are value-preserving. */
        printf("%5d,%5x,%9s\n", c, (unsigned int)c,
               format_binary((unsigned int)c, bits));
        ++nc;
    }

    /* EOF from getchar() means either end of input or a read error. */
    if (ferror(stdin)) {
        perror("stdin");
        return 1;
    }

    printf("Character count:%ld\n", nc);
    return 0;
}
| 32 | |
/* Strategy

  - Define a table of valid Navajo graphemes.

  - Define a map from precomposed characters and combining mark
    permutations to the graphemes in the table.

  - Read the input sequentially, incrementing a per-grapheme integer
    count for each grapheme detected.

  - Print the grapheme totals.

*/
| 46 | |
/* Process

  1. Read the next byte as an int via c = getchar().
  2. Detect whether c is printable ASCII (c within [32-126]) or part of a
     multibyte sequence (high bit set).
     a. If ASCII, increment nc and continue to the next loop iteration.
     b. If multibyte, calculate the Unicode code point.
  3. Detect whether the code point falls into a known combining mark range.
     a. If it is a combining mark, continue to the next loop iteration.
     b. If it is not a combining mark, increment nc and continue to the
        next loop iteration.

*/
| 58 | |
| 59 | // Author: Steven Baltakatei Sandoval |
| 60 | // License: GPLv3+ |