From: Steven Baltakatei Sandoval Date: Thu, 25 Jan 2024 22:59:02 +0000 (+0000) Subject: feat(unitproc/bkt-replace_contractions):Use backreferences X-Git-Url: https://zdv2.bktei.com/gitweb/BK-2020-03.git/commitdiff_plain/7e653610fa473edda70e211a3daf47ef69a2e834?ds=inline feat(unitproc/bkt-replace_contractions):Use backreferences - Note: Various more contractions added. They won't cover something crazy like Mark Twain's novels that attempt to model accents with copious apostrophe use, but the script should cover most commonly used contractions in almost scholarly publications. --- diff --git a/unitproc/bkt-replace_contractions b/unitproc/bkt-replace_contractions index fe1d5d6..94731e3 100644 --- a/unitproc/bkt-replace_contractions +++ b/unitproc/bkt-replace_contractions @@ -8,7 +8,7 @@ replace_contractions() { # Input: stdin # arg1 file path # Output: stdout - # Version: 0.0.1 + # Version: 0.1.0 # Depends: GNU sed 4.8 # Check input @@ -26,26 +26,75 @@ replace_contractions() { fi; # Perform substitutions + ## Note: See https://en.wiktionary.org/wiki/Category:English_contractions + ## Note: Order of replacements sorted most-specific first. 
sed -E \ - -e "s/(you're|You're|YOU'RE)/youʼre/gI" \ - -e "s/(i'm|I'm|I'M)/Iʼm/gI" \ - -e "s/(you've|You've|YOU'VE)/youʼve/gI" \ - -e "s/(they're|They're|THEY'RE)/theyʼre/gI" \ - -e "s/(we're|We're|WE'RE)/weʼre/gI" \ - -e "s/(they've|They've|THEY'VE)/theyʼve/gI" \ - -e "s/(we've|We've|WE'VE)/weʼve/gI" \ - -e "s/(i've|I've|I'VE)/Iʼve/gI" \ - -e "s/(that's|That's|THAT'S)/thatʼs/gI" \ - -e "s/(what's|What's|WHAT'S)/whatʼs/gI" \ - -e "s/(here's|Here's|HERE'S)/hereʼs/gI" \ - -e "s/(there's|There's|THERE'S)/thereʼs/gI" \ - -e "s/(where's|Where's|WHERE'S)/whereʼs/gI" \ - -e "s/(who's|Who's|WHO'S)/whoʼs/gI" \ - -e "s/(how's|How's|HOW'S)/howʼs/gI" \ - -e "s/(doesn't|Doesn't|DOESN'T)/doesnʼt/gI" \ - -e "s/(don't|Don't|DON'T)/donʼt/gI" \ - -e "s/(i'll|I'll|I'LL)/Iʼll/gI" \ - -e "s/(we'll|We'll|WE'LL)/weʼll/gI" \ - -e "s/(they'll|They'll|THEY'LL)/theyʼll/gI" \ - "$input"; + -e "s/(you|You|YOU)'(ren|REN|ven|VEN)'(t|T)/\1ʼ\2ʼ\3/g" \ + -e "s/(you|You|YOU)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \ + -e "s/(you|You|YOU)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(you|You|YOU)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \ + -e "s/(y|Y)'(all|ALL)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \ + -e "s/(y|Y)'(all|ALL)'(d|D)'(nt|NT)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \ + -e "s/(y|Y)'(all|ALL)'(d|D)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \ + -e "s/(y|Y)'(all|ALL)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(y|Y)'(all|ALL)/\1ʼ\2/g" \ + -e "s/(y|Y)'(ain|AIN)'(t|T)/\1ʼ\2ʼ\3/g" \ + -e "s/(wouldn|Wouldn|WOULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(wouldn|Wouldn|WOULDN)'(t|T)/\1ʼ\2/g" \ + -e "s/(won|Won|WON)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(won|Won|WON)'(t|T)/\1ʼ\2/g" \ + -e "s/(who|Who|WHO)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(who|Who|WHO)'(d|D|ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \ + -e "s/(where|Where|WHERE)'(s|S)/\1ʼ\2/g" \ + -e "s/(what|What|WHAT)'(ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \ + -e "s/(weren|Weren|WEREN)'(t|T)/\1ʼ\2/g" \ + -e "s/(we|We|WE)'(ven|VEN)'(t|T)/\1ʼ\2ʼ\3/g" \ + -e "s/(we|We|WE)'(ren|REN)'(t|T)/\1ʼ\2ʼ\3/g" \ + 
-e "s/(we|We|WE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(we|We|WE)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \ + -e "s/(they|They|THEY)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(they|They|THEY)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \ + -e "s/(there|There|THERE)'(ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(there|There|THERE)'(s|S|ve|VE)/\1ʼ\2/g" \ + -e "s/(that|That|THAT)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(that|That|THAT)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \ + -e "s/(shouldn|Shouldn|SHOULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(shouldn|Shouldn|SHOULDN)'(t|T)/\1ʼ\2/g" \ + -e "s/(she|She|SHE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(she|She|SHE)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \ + -e "s/(shan|Shan|SHAN)'(t|T)/\1ʼ\2/g" \ + -e "s/(oughtn|Oughtn|OUGHTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(oughtn|Oughtn|OUGHTN)'(t|T)/\1ʼ\2/g" \ + -e "s/(mustn|Mustn|MUSTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(mustn|Mustn|MUSTN)'(t|T)/\1ʼ\2/g" \ + -e "s/(must|Must|MUST)'(ve|VE)/\1ʼ\2/g" \ + -e "s/(mightn|Mightn|MIGHTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(mightn|Mightn|MIGHTN)'(t|T)/\1ʼ\2/g" \ + -e "s/(might|Might|MIGHT)'(ve|VE)/\1ʼ\2/g" \ + -e "s/(let|Let|LET)'(s|S)/\1ʼ\2/g" \ + -e "s/(it|It|IT)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(it|It|IT)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \ + -e "s/(isn|Isn|ISN)'(t|T)/\1ʼ\2/g" \ + -e "s/(I|i)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \ + -e "s/(I|i)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(I|i)'(d|D|ll|LL|m|M|ve|VE)/\1ʼ\2/g" \ + -e "s/(he|He|HE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(he|He|HE)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \ + -e "s/(haven|Haven|HAVEN)'(t|T)/\1ʼ\2/g" \ + -e "s/(hasn|Hasn|HASN)'(t|T)/\1ʼ\2/g" \ + -e "s/(hadn|Hadn|HADN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(hadn|Hadn|HADN)'(t|T)/\1ʼ\2/g" \ + -e "s/(d|D)'(ya|YA|you|YOU)/\1ʼ\2/g" \ + -e "s/(don|Don|DON)'(t|T)/\1ʼ\2/g" \ + -e "s/(doesn|Doesn|DOESN)'(t|T)/\1ʼ\2/g" \ + -e "s/(didn|Didn|DIDN)'(t|T)/\1ʼ\2/g" \ + -e "s/(could|Could|COULD)'(ve|VE)/\1ʼ\2/g" \ + -e "s/(couldn|Couldn|COULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e 
"s/(couldn|Couldn|COULDN)'(t|T)/\1ʼ\2/g" \ + -e "s/(can|Can|CAN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(can|Can|CAN)'(t|T)/\1ʼ\2/g" \ + -e "s/(aren|Aren|AREN)'(t|T)/\1ʼ\2/g" \ + -e "s/(ate|Ate|ATE)'(nt|NT)/\1ʼ\2/g" \ + -e "s/(ain|Ain|AIN)'(t|T)/\1ʼ\2/g" \ + "$input" }; # replace ' with ʼ in contractions