| 1 | #!/bin/bash |
| 2 | |
| 3 | # Function to replace contractions |
replace_apostrophes() {
  # Desc: Replace ' with ʼ in English contractions
  # Note: In contractions of a UTF-8 text stream, replaces U+0027
  #       APOSTROPHE with U+02BC MODIFIER LETTER APOSTROPHE.
  #       Apostrophes that are not part of a listed contraction
  #       (or of a trailing 's / 'em) are left untouched.
  # Input: arg1 (optional): path of a regular file to read.
  #        stdin: read when arg1 is absent or is not a regular file
  #        (this includes "-").
  # Output: stdout: the converted text
  # Return: 1 if more than one argument is given; otherwise the exit
  #         status of sed.
  # Version: 1.0.0 (BK-2020-03)
  # Depends: GNU sed 4.8 (-E and \b are used)

  # Keep the working variable out of the caller's namespace.
  local input;

  # Check input
  if [[ "$#" -gt 1 ]]; then
    echo "FATAL:Incorrect argument count:$#" 1>&2;
    return 1;
  fi;

  # "${1-}" keeps the test safe when the caller runs under `set -u`
  # and passes no argument.
  if [[ -f "${1-}" ]]; then
    # Use specified file
    input="$1";
  else
    # Use standard input
    input="-";
  fi;

  # Perform substitutions
  ## Note: See https://en.wiktionary.org/wiki/Category:English_contractions
  ## Note: Order of replacements sorted most-specific first. A rule
  ##       only matches U+0027, so once a shorter rule has converted
  ##       an apostrophe, a longer rule for the same word can no
  ##       longer match. Longer forms (e.g. mustn't've) must
  ##       therefore precede their prefixes (mustn't).
  ## Note: "--" ends option parsing so that a file name starting
  ##       with "-" is not mistaken for a sed option; a lone "-"
  ##       after it still selects standard input.
  sed -E \
    -e "s/(you|You|YOU)'(ren|REN|ven|VEN)'(t|T)/\1ʼ\2ʼ\3/g" \
    -e "s/(you|You|YOU)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \
    -e "s/(you|You|YOU)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(you|You|YOU)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \
    -e "s/(y|Y)'(all|ALL)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \
    -e "s/(y|Y)'(all|ALL)'(d|D)'(nt|NT)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \
    -e "s/(y|Y)'(all|ALL)'(d|D)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \
    -e "s/(y|Y)'(all|ALL)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(y|Y)'(all|ALL)/\1ʼ\2/g" \
    -e "s/(y|Y)'(ain|AIN)'(t|T)/\1ʼ\2ʼ\3/g" \
    -e "s/(wouldn|Wouldn|WOULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(wouldn|Wouldn|WOULDN)'(t|T)/\1ʼ\2/g" \
    -e "s/(won|Won|WON)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(won|Won|WON)'(t|T)/\1ʼ\2/g" \
    -e "s/(who|Who|WHO)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(who|Who|WHO)'(d|D|ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \
    -e "s/(where|Where|WHERE)'(d|D|s|S)/\1ʼ\2/g" \
    -e "s/(what|What|WHAT)'(ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \
    -e "s/(weren|Weren|WEREN)'(t|T)/\1ʼ\2/g" \
    -e "s/(we|We|WE)'(ven|VEN)'(t|T)/\1ʼ\2ʼ\3/g" \
    -e "s/(we|We|WE)'(ren|REN)'(t|T)/\1ʼ\2ʼ\3/g" \
    -e "s/(we|We|WE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(we|We|WE)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \
    -e "s/(wasn|Wasn|WASN)'(t|T)/\1ʼ\2/g" \
    -e "s/(they|They|THEY)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(they|They|THEY)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \
    -e "s/(there|There|THERE)'(ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(there|There|THERE)'(s|S|ve|VE)/\1ʼ\2/g" \
    -e "s/(that|That|THAT)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(that|That|THAT)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \
    -e "s/(shouldn|Shouldn|SHOULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(shouldn|Shouldn|SHOULDN)'(t|T)/\1ʼ\2/g" \
    -e "s/(she|She|SHE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(she|She|SHE)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \
    -e "s/(shan|Shan|SHAN)'(t|T)/\1ʼ\2/g" \
    -e "s/'(s\b)/ʼ\1/g" \
    -e "s/(oughtn|Oughtn|OUGHTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(oughtn|Oughtn|OUGHTN)'(t|T)/\1ʼ\2/g" \
    -e "s/(o|O)'(clock|CLOCK)/\1ʼ\2/g" \
    -e "s/(mustn|Mustn|MUSTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(mustn|Mustn|MUSTN)'(t|T)/\1ʼ\2/g" \
    -e "s/(mightn|Mightn|MIGHTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(mightn|Mightn|MIGHTN)'(t|T)/\1ʼ\2/g" \
    -e "s/(might|Might|MIGHT)'(ve|VE)/\1ʼ\2/g" \
    -e "s/(let|Let|LET)'(s|S)/\1ʼ\2/g" \
    -e "s/(it|It|IT)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(it|It|IT)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \
    -e "s/(isn|Isn|ISN)'(t|T)/\1ʼ\2/g" \
    -e "s/(I|i)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \
    -e "s/(I|i)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(I|i)'(d|D|ll|LL|m|M|ve|VE)/\1ʼ\2/g" \
    -e "s/(how|How|HOW)'(d|D)/\1ʼ\2/g" \
    -e "s/(he|He|HE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(he|He|HE)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \
    -e "s/(haven|Haven|HAVEN)'(t|T)/\1ʼ\2/g" \
    -e "s/(hasn|Hasn|HASN)'(t|T)/\1ʼ\2/g" \
    -e "s/(hadn|Hadn|HADN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(hadn|Hadn|HADN)'(t|T)/\1ʼ\2/g" \
    -e "s/'(em\b)/ʼ\1/g" \
    -e "s/(d|D)'(ya|YA|you|YOU)/\1ʼ\2/g" \
    -e "s/(don|Don|DON)'(t|T)/\1ʼ\2/g" \
    -e "s/(doesn|Doesn|DOESN)'(t|T)/\1ʼ\2/g" \
    -e "s/(didn|Didn|DIDN)'(t|T)/\1ʼ\2/g" \
    -e "s/(could|Could|COULD)'(ve|VE)/\1ʼ\2/g" \
    -e "s/(couldn|Couldn|COULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(couldn|Couldn|COULDN)'(t|T)/\1ʼ\2/g" \
    -e "s/(c|C)'(mere|MERE)/\1ʼ\2/g" \
    -e "s/(can|Can|CAN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
    -e "s/(can|Can|CAN)'(t|T)/\1ʼ\2/g" \
    -e "s/'(cause|Cause|CAUSE)/ʼ\1/g" \
    -e "s/'(bout|Bout|BOUT)/ʼ\1/g" \
    -e "s/(aren|Aren|AREN)'(t|T)/\1ʼ\2/g" \
    -e "s/(ate|Ate|ATE)'(nt|NT)/\1ʼ\2/g" \
    -e "s/(ain|Ain|AIN)'(t|T)/\1ʼ\2/g" \
    -- "$input"
}; # replace ' with ʼ in contractions
| 109 | |
| 110 | # Author: Steven Baltakatei Sandoval |
| 111 | # License: GPLv3+ |