From: Steven Baltakatei Sandoval Date: Thu, 25 Jan 2024 22:59:02 +0000 (+0000) Subject: feat(unitproc/bkt-replace_contractions):Use backreferences X-Git-Url: https://zdv2.bktei.com/gitweb/BK-2020-03.git/commitdiff_plain/7e653610fa473edda70e211a3daf47ef69a2e834?ds=sidebyside;hp=265d3d9a134fcb78054560278b5d9b6d1f6396bd feat(unitproc/bkt-replace_contractions):Use backreferences - Note: Various additional contractions added. They won't cover something crazy like Mark Twain's novels that attempt to model accents with copious apostrophe use, but the script should cover most commonly used contractions in almost all scholarly publications. --- diff --git a/unitproc/bkt-replace_contractions b/unitproc/bkt-replace_contractions index fe1d5d6..94731e3 100644 --- a/unitproc/bkt-replace_contractions +++ b/unitproc/bkt-replace_contractions @@ -8,7 +8,7 @@ replace_contractions() { # Input: stdin # arg1 file path # Output: stdout - # Version: 0.0.1 + # Version: 0.1.0 # Depends: GNU sed 4.8 # Check input @@ -26,26 +26,75 @@ replace_contractions() { fi; # Perform substitutions + ## Note: See https://en.wiktionary.org/wiki/Category:English_contractions + ## Note: Order of replacements sorted most-specific first. 
sed -E \ - -e "s/(you're|You're|YOU'RE)/youʼre/gI" \ - -e "s/(i'm|I'm|I'M)/Iʼm/gI" \ - -e "s/(you've|You've|YOU'VE)/youʼve/gI" \ - -e "s/(they're|They're|THEY'RE)/theyʼre/gI" \ - -e "s/(we're|We're|WE'RE)/weʼre/gI" \ - -e "s/(they've|They've|THEY'VE)/theyʼve/gI" \ - -e "s/(we've|We've|WE'VE)/weʼve/gI" \ - -e "s/(i've|I've|I'VE)/Iʼve/gI" \ - -e "s/(that's|That's|THAT'S)/thatʼs/gI" \ - -e "s/(what's|What's|WHAT'S)/whatʼs/gI" \ - -e "s/(here's|Here's|HERE'S)/hereʼs/gI" \ - -e "s/(there's|There's|THERE'S)/thereʼs/gI" \ - -e "s/(where's|Where's|WHERE'S)/whereʼs/gI" \ - -e "s/(who's|Who's|WHO'S)/whoʼs/gI" \ - -e "s/(how's|How's|HOW'S)/howʼs/gI" \ - -e "s/(doesn't|Doesn't|DOESN'T)/doesnʼt/gI" \ - -e "s/(don't|Don't|DON'T)/donʼt/gI" \ - -e "s/(i'll|I'll|I'LL)/Iʼll/gI" \ - -e "s/(we'll|We'll|WE'LL)/weʼll/gI" \ - -e "s/(they'll|They'll|THEY'LL)/theyʼll/gI" \ - "$input"; + -e "s/(you|You|YOU)'(ren|REN|ven|VEN)'(t|T)/\1ʼ\2ʼ\3/g" \ + -e "s/(you|You|YOU)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \ + -e "s/(you|You|YOU)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(you|You|YOU)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \ + -e "s/(y|Y)'(all|ALL)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \ + -e "s/(y|Y)'(all|ALL)'(d|D)'(nt|NT)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \ + -e "s/(y|Y)'(all|ALL)'(d|D)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \ + -e "s/(y|Y)'(all|ALL)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(y|Y)'(all|ALL)/\1ʼ\2/g" \ + -e "s/(y|Y)'(ain|AIN)'(t|T)/\1ʼ\2ʼ\3/g" \ + -e "s/(wouldn|Wouldn|WOULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(wouldn|Wouldn|WOULDN)'(t|T)/\1ʼ\2/g" \ + -e "s/(won|Won|WON)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(won|Won|WON)'(t|T)/\1ʼ\2/g" \ + -e "s/(who|Who|WHO)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(who|Who|WHO)'(d|D|ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \ + -e "s/(where|Where|WHERE)'(s|S)/\1ʼ\2/g" \ + -e "s/(what|What|WHAT)'(ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \ + -e "s/(weren|Weren|WEREN)'(t|T)/\1ʼ\2/g" \ + -e "s/(we|We|WE)'(ven|VEN)'(t|T)/\1ʼ\2ʼ\3/g" \ + -e "s/(we|We|WE)'(ren|REN)'(t|T)/\1ʼ\2ʼ\3/g" \ + 
-e "s/(we|We|WE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(we|We|WE)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \ + -e "s/(they|They|THEY)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(they|They|THEY)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \ + -e "s/(there|There|THERE)'(ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(there|There|THERE)'(s|S|ve|VE)/\1ʼ\2/g" \ + -e "s/(that|That|THAT)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(that|That|THAT)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \ + -e "s/(shouldn|Shouldn|SHOULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(shouldn|Shouldn|SHOULDN)'(t|T)/\1ʼ\2/g" \ + -e "s/(she|She|SHE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(she|She|SHE)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \ + -e "s/(shan|Shan|SHAN)'(t|T)/\1ʼ\2/g" \ + -e "s/(oughtn|Oughtn|OUGHTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(oughtn|Oughtn|OUGHTN)'(t|T)/\1ʼ\2/g" \ + -e "s/(mustn|Mustn|MUSTN)'(t|T)/\1ʼ\2/g" \ + -e "s/(mustn|Mustn|MUSTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(mustn|Mustn|MUSTN)'(t|T)/\1ʼ\2/g" \ + -e "s/(mightn|Mightn|MIGHTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(mightn|Mightn|MIGHTN)'(t|T)/\1ʼ\2/g" \ + -e "s/(might|Might|MIGHT)'(ve|VE)/\1ʼ\2/g" \ + -e "s/(let|Let|LET)'(s|S)/\1ʼ\2/g" \ + -e "s/(it|It|IT)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(it|It|IT)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \ + -e "s/(isn|Isn|ISN)'(t|T)/\1ʼ\2/g" \ + -e "s/(I|i)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \ + -e "s/(I|i)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(I|i)'(d|D|ll|LL|m|M|ve|VE)/\1ʼ\2/g" \ + -e "s/(he|He|HE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(he|He|HE)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \ + -e "s/(haven|Haven|HAVEN)'(t|T)/\1ʼ\2/g" \ + -e "s/(hasn|Hasn|HASN)'(t|T)/\1ʼ\2/g" \ + -e "s/(hadn|Hadn|HADN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(hadn|Hadn|HADN)'(t|T)/\1ʼ\2/g" \ + -e "s/(d|D)'(ya|YA|you|YOU)/\1ʼ\2/g" \ + -e "s/(don|Don|DON)'(t|T)/\1ʼ\2/g" \ + -e "s/(doesn|Doesn|DOESN)'(t|T)/\1ʼ\2/g" \ + -e "s/(didn|Didn|DIDN)'(t|T)/\1ʼ\2/g" \ + -e "s/(could|Could|COULD)'(ve|VE)/\1ʼ\2/g" \ + -e "s/(couldn|Couldn|COULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e 
"s/(couldn|Couldn|COULDN)'(t|T)/\1ʼ\2/g" \ + -e "s/(can|Can|CAN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(can|Can|CAN)'(t|T)/\1ʼ\2/g" \ + -e "s/(aren|Aren|AREN)'(t|T)/\1ʼ\2/g" \ + -e "s/(ate|Ate|ATE)'(nt|NT)/\1ʼ\2/g" \ + -e "s/(ain|Ain|AIN)'(t|T)/\1ʼ\2/g" \ + "$input" }; # replace ' with ʼ in contractions