feat(unitproc/bkt-replace_contractions): Use backreferences
author Steven Baltakatei Sandoval <baltakatei@gmail.com>
Thu, 25 Jan 2024 22:59:02 +0000 (22:59 +0000)
committer Steven Baltakatei Sandoval <baltakatei@gmail.com>
Thu, 25 Jan 2024 22:59:02 +0000 (22:59 +0000)
- Note: Various additional contractions added. They won't cover something
crazy like Mark Twain's novels that attempt to model accents with
copious apostrophe use, but the script should cover the most commonly used
contractions in almost all scholarly publications.

unitproc/bkt-replace_contractions

index fe1d5d61111f1240d4da742e2e228a6d4792986d..94731e381c7a2b09070262c45f4811220f1ff9ae 100644 (file)
@@ -8,7 +8,7 @@ replace_contractions() {
     # Input: stdin  
     #        arg1   file path
     # Output: stdout
-    # Version: 0.0.1
+    # Version: 0.1.0
     # Depends: GNU sed 4.8
 
     # Check input
@@ -26,26 +26,75 @@ replace_contractions() {
     fi;
 
     # Perform substitutions
+    ## Note: See https://en.wiktionary.org/wiki/Category:English_contractions
+    ## Note: Order of replacements sorted most-specific first.
     sed -E \
-        -e "s/(you're|You're|YOU'RE)/youʼre/gI" \
-        -e "s/(i'm|I'm|I'M)/Iʼm/gI" \
-        -e "s/(you've|You've|YOU'VE)/youʼve/gI" \
-        -e "s/(they're|They're|THEY'RE)/theyʼre/gI" \
-        -e "s/(we're|We're|WE'RE)/weʼre/gI" \
-        -e "s/(they've|They've|THEY'VE)/theyʼve/gI" \
-        -e "s/(we've|We've|WE'VE)/weʼve/gI" \
-        -e "s/(i've|I've|I'VE)/Iʼve/gI" \
-        -e "s/(that's|That's|THAT'S)/thatʼs/gI" \
-        -e "s/(what's|What's|WHAT'S)/whatʼs/gI" \
-        -e "s/(here's|Here's|HERE'S)/hereʼs/gI" \
-        -e "s/(there's|There's|THERE'S)/thereʼs/gI" \
-        -e "s/(where's|Where's|WHERE'S)/whereʼs/gI" \
-        -e "s/(who's|Who's|WHO'S)/whoʼs/gI" \
-        -e "s/(how's|How's|HOW'S)/howʼs/gI" \
-        -e "s/(doesn't|Doesn't|DOESN'T)/doesnʼt/gI" \
-        -e "s/(don't|Don't|DON'T)/donʼt/gI" \
-        -e "s/(i'll|I'll|I'LL)/Iʼll/gI" \
-        -e "s/(we'll|We'll|WE'LL)/weʼll/gI" \
-        -e "s/(they'll|They'll|THEY'LL)/theyʼll/gI" \
-        "$input";
+        -e "s/(you|You|YOU)'(ren|REN|ven|VEN)'(t|T)/\1ʼ\2ʼ\3/g" \
+        -e "s/(you|You|YOU)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \
+        -e "s/(you|You|YOU)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(you|You|YOU)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \
+        -e "s/(y|Y)'(all|ALL)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \
+        -e "s/(y|Y)'(all|ALL)'(d|D)'(nt|NT)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \
+        -e "s/(y|Y)'(all|ALL)'(d|D)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \
+        -e "s/(y|Y)'(all|ALL)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(y|Y)'(all|ALL)/\1ʼ\2/g" \
+        -e "s/(y|Y)'(ain|AIN)'(t|T)/\1ʼ\2ʼ\3/g" \
+        -e "s/(wouldn|Wouldn|WOULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(wouldn|Wouldn|WOULDN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(won|Won|WON)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(won|Won|WON)'(t|T)/\1ʼ\2/g" \
+        -e "s/(who|Who|WHO)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(who|Who|WHO)'(d|D|ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \
+        -e "s/(where|Where|WHERE)'(s|S)/\1ʼ\2/g" \
+        -e "s/(what|What|WHAT)'(ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \
+        -e "s/(weren|Weren|WEREN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(we|We|WE)'(ven|VEN)'(t|T)/\1ʼ\2ʼ\3/g" \
+        -e "s/(we|We|WE)'(ren|REN)'(t|T)/\1ʼ\2ʼ\3/g" \
+        -e "s/(we|We|WE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(we|We|WE)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \
+        -e "s/(they|They|THEY)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(they|They|THEY)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \
+        -e "s/(there|There|THERE)'(ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(there|There|THERE)'(s|S|ve|VE)/\1ʼ\2/g" \
+        -e "s/(that|That|THAT)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(that|That|THAT)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \
+        -e "s/(shouldn|Shouldn|SHOULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(shouldn|Shouldn|SHOULDN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(she|She|SHE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(she|She|SHE)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \
+        -e "s/(shan|Shan|SHAN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(oughtn|Oughtn|OUGHTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(oughtn|Oughtn|OUGHTN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(mustn|Mustn|MUSTN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(mustn|Mustn|MUSTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(mustn|Mustn|MUSTN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(mightn|Mightn|MIGHTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(mightn|Mightn|MIGHTN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(might|Might|MIGHT)'(ve|VE)/\1ʼ\2/g" \
+        -e "s/(let|Let|LET)'(s|S)/\1ʼ\2/g" \
+        -e "s/(it|It|IT)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(it|It|IT)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \
+        -e "s/(isn|Isn|ISN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(I|i)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \
+        -e "s/(I|i)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(I|i)'(d|D|ll|LL|m|M|ve|VE)/\1ʼ\2/g" \
+        -e "s/(he|He|HE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(he|He|HE)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \
+        -e "s/(haven|Haven|HAVEN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(hasn|Hasn|HASN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(hadn|Hadn|HADN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(hadn|Hadn|HADN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(d|D)'(ya|YA|you|YOU)/\1ʼ\2/g" \
+        -e "s/(don|Don|DON)'(t|T)/\1ʼ\2/g" \
+        -e "s/(doesn|Doesn|DOESN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(didn|Didn|DIDN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(could|Could|COULD)'(ve|VE)/\1ʼ\2/g" \
+        -e "s/(couldn|Couldn|COULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(couldn|Couldn|COULDN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(can|Can|CAN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(can|Can|CAN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(aren|Aren|AREN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(ate|Ate|ATE)'(nt|NT)/\1ʼ\2/g" \
+        -e "s/(ain|Ain|AIN)'(t|T)/\1ʼ\2/g" \
+        "$input"
 }; # replace ' with ʼ in contractions