| 1 | # Date: 2020-02-21T21:23Z; |
| 2 | |
| 3 | # Author: Steven Baltakatei Sandoval |
| 4 | |
| 5 | # License: This bash script, 'bkcsvjoin.sh', is licensed under GPLv3 or |
| 6 | # later by Steven Baltakatei Sandoval: |
| 7 | # |
| 8 | # 'bkcsvjoin.sh' sorts and joins two CSV files based on first field |
| 9 | # Copyright (C) 2020 Steven Baltakatei Sandoval (baltakatei.com) |
| 10 | # |
| 11 | # This program is free software: you can redistribute it and/or modify |
| 12 | # it under the terms of the GNU General Public License as published by |
| 13 | # the Free Software Foundation, either version 3 of the License, or |
| 14 | # any later version. |
| 15 | # |
| 16 | # This program is distributed in the hope that it will be useful, |
| 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 19 | # GNU General Public License for more details. |
| 20 | # |
| 21 | # A copy of the GNU General Public License may be found at |
| 22 | # <https://www.gnu.org/licenses/>. |
| 23 | |
| 24 | # Description: Sorts and joins two CSV files based upon the contents |
| 25 | # of the first field ("field1") in each line of CSV file. It assumes |
| 26 | # the first line is a header labelling field names. It outputs several |
| 27 | # files into a new directory in the working directory. Set theory |
| 28 | # characters in output file names indicate file 1 elements with "A" |
| 29 | # and file 2 elements with "B". Input assumes UNIX-style line |
| 30 | # endings. The output files use the following numbers and definitions: |
| 31 | # |
| 32 | # 1. A⋃B The union of A and B. Contains all field1 elements from file 1 and 2. |
| 33 | # |
| 34 | # 2. A⋃(A⋂B) Contains all field1 elements of file 1 plus the matches between |
| 35 | # file 1 and 2. |
| 36 | # |
| 37 | # 3. B⋃(B⋂A) Contains all field1 elements of file 2 plus the matches between |
| 38 | # file 1 and 2. |
| 39 | # |
| 40 | # 4. A⋂B Contains only field1 elements present in both file 1 and file 2. |
| 41 | # |
| 42 | # 5. A-B Contains all field1 elements of file 1 except when a matching |
| 43 | # element is present in file 2. |
| 44 | # |
| 45 | # 6. B-A Contains all field1 elements of file 2 except when a matching |
| 46 | # element is present in file 1. |
| 47 | # |
| 48 | # 7. A⊖B Contains all field1 elements of file 1 and file 2 except for |
| 49 | # elements which are present in both file 1 and 2. |
| 50 | |
| 51 | # Usage: bkcsvjoin.sh [ file1 ] [ file2 ] > fileOut.csv |
| 52 | |
| 53 | # Dependencies: GNU Coreutils 8.30-3 (bash, join, sort, head, tail, |
| 54 | # mkdir) (see end of file for license details). Note: only tested on |
| 55 | # Debian GNU/Linux 10. |
| 56 | |
| 57 | # Version 0.0.1 |
| 58 | |
| 59 | echoerr() { echo "$@" 1>&2; } # Function for outputing text to stderr. |
| 60 | |
| 61 | SCRIPT_DATE=$(date +%Y%m%dT%H%M%S%z) |
| 62 | SCRIPT_PATH=$(pwd) |
| 63 | |
| 64 | # Create temporary working directory |
| 65 | TEMP_DIRPATH="$SCRIPT_PATH"/"$SCRIPT_DATE"..temp |
| 66 | mkdir "$TEMP_DIRPATH" |
| 67 | |
| 68 | # Read file names as input arguments |
| 69 | FILE1_PATH="$1" |
| 70 | FILE2_PATH="$2" |
| 71 | FILE1_BASENAME=$(basename "$FILE1_PATH") |
| 72 | FILE2_BASENAME=$(basename "$FILE2_PATH") |
| 73 | #[[ -f "$FILE1_PATH" ]] && echoerr "DEBUG:$FILE1_PATH is file with basename $FILE1_BASENAME." |
| 74 | #[[ -f "$FILE2_PATH" ]] && echoerr "DEBUG:$FILE2_PATH is file with basename $FILE2_BASENAME." |
| 75 | |
| 76 | # Warn and prompt if carriage return character is used. |
| 77 | if [[ $(grep -Uc $'\x0D' "$FILE1_PATH") -gt 0 ]] || [[ $(grep -Uc $'\x0D' "$FILE2_PATH") -gt 0 ]]; then |
| 78 | echoerr "ERROR: Carriage Return character(s) detected. Windows text file?" |
| 79 | read -p "Do you wish to attempt to convert DOS-style line endings to UNIX-style by removing all carriage returns? (probably okay) [y/n]" choice |
| 80 | case "$choice" in |
| 81 | y|Y ) echoerr "yes" |
| 82 | OPTION_DOS2UNIX="true" |
| 83 | ;; |
| 84 | n|N ) echoerr "no" |
| 85 | OPTION_DOS2UNIX="false" |
| 86 | echoerr "WARNING:Output may contain mix of DOS-style (CRLF) and UNIX-style (LF) line endings!" |
| 87 | echoerr " See https://stackoverflow.com/a/2613834 " |
| 88 | sleep 2 |
| 89 | ;; |
| 90 | * ) echo "Invalid selection." |
| 91 | echoerr "Exiting." && exit 1;; |
| 92 | esac |
| 93 | fi |
| 94 | |
| 95 | # Sort files on field1 and write to temporary directory. See [1], and [2]. |
| 96 | FILE1_SORTED_PATH="$TEMP_DIRPATH"/"$FILE1_BASENAME"_SORTED.csv |
| 97 | FILE2_SORTED_PATH="$TEMP_DIRPATH"/"$FILE2_BASENAME"_SORTED.csv |
| 98 | ( head -n1 $FILE1_PATH && tail -n+2 $FILE1_PATH | sort -t, -k1,1 | uniq ) > $FILE1_SORTED_PATH |
| 99 | ( head -n1 $FILE2_PATH && tail -n+2 $FILE2_PATH | sort -t, -k1,1 | uniq ) > $FILE2_SORTED_PATH |
| 100 | |
| 101 | # 1. A⋃B. Join files into Union of (file 1) + (file 2) (i.e. throwing nothing out). See [3], [6]. |
| 102 | FILES_JOINED_COMPLETE="$( join -t, -a1 -a2 -o auto --header "$FILE1_SORTED_PATH" "$FILE2_SORTED_PATH" )" |
| 103 | |
| 104 | # 2. A⋃(A⋂B). Join files, file 1 uniques only (omit unmatched field1 entries from file 2). See [6]. |
| 105 | FILES_JOINED_F1UO="$( join -t, -a1 -o auto --header "$FILE1_SORTED_PATH" "$FILE2_SORTED_PATH" )" |
| 106 | |
| 107 | # 3. B⋃(A⋂B). Join files, file 2 uniques only (omit unmatched field1 entries from file 1). See [6]. |
| 108 | FILES_JOINED_F2UO="$( join -t, -a2 -o auto --header "$FILE1_SORTED_PATH" "$FILE2_SORTED_PATH" )" |
| 109 | |
| 110 | # 4. A⋂B. Join files, only entries present in both file 1 and file 2 ( file 1 ⋂ file 2 ). See [6]. |
| 111 | FILES_JOINED_INTERSECTION="$( join -t, -o auto --header "$FILE1_SORTED_PATH" "$FILE2_SORTED_PATH" )" |
| 112 | |
| 113 | # 5. A-B. Join files into Relative Complement of (file 1) - (file 2) field1 entries. See See [5], [6]. |
| 114 | FILES_JOINED_DIFFERENCE_AB="$( join -t, -v1 -o auto --header "$FILE1_SORTED_PATH" "$FILE2_SORTED_PATH" )" |
| 115 | |
| 116 | # 6. B-A. Join files into Relative Complement of (file 2) - (file 1) field1 entries. See [5], [6]. |
| 117 | FILES_JOINED_DIFFERENCE_BA="$( join -t, -v2 -o auto --header "$FILE1_SORTED_PATH" "$FILE2_SORTED_PATH" )" |
| 118 | |
| 119 | # 7. A⊖B. Join files into Symmetric Difference of (file 2) ⊖ (file 1) field1 entries. See [5], [6]. |
| 120 | FILES_JOINED_DIFFERENCE_SYM="$( join -t, -v1 -v2 -o auto --header "$FILE1_SORTED_PATH" "$FILE2_SORTED_PATH" )" |
| 121 | |
| 122 | # Attempt to restore to UNIX-style line endings by removing all carriage return (CR) characters OPTION_DOS2UNIX set to true. See [7]. |
| 123 | if [[ $OPTION_DOS2UNIX == "true" ]]; then |
| 124 | FILES_JOINED_COMPLETE="$(echo "$FILES_JOINED_COMPLETE" | tr -d '\r')" |
| 125 | FILES_JOINED_F1UO="$(echo "$FILES_JOINED_F1UO" | tr -d '\r')" |
| 126 | FILES_JOINED_F2UO="$(echo "$FILES_JOINED_F2UO" | tr -d '\r')" |
| 127 | FILES_JOINED_INTERSECTION="$(echo "$FILES_JOINED_INTERSECTION" | tr -d '\r')" |
| 128 | FILES_JOINED_DIFFERENCE_AB="$(echo "$FILES_JOINED_DIFFERENCE_AB" | tr -d '\r')" |
| 129 | FILES_JOINED_DIFFERENCE_BA="$(echo "$FILES_JOINED_DIFFERENCE_BA" | tr -d '\r')" |
| 130 | FILES_JOINED_DIFFERENCE_SYM="$(echo "$FILES_JOINED_DIFFERENCE_SYM" | tr -d '\r')" |
| 131 | fi |
| 132 | |
| 133 | # Output CSV files to temp directory with carriage returns ('\r') removed. |
| 134 | echo "$FILES_JOINED_COMPLETE" > "$TEMP_DIRPATH"/"1.A⋃B..UNION.csv" && echoerr "1.A⋃B..UNION.csv written to $TEMP_DIRPATH" |
| 135 | echo "$FILES_JOINED_F1UO" > "$TEMP_DIRPATH"/"2.A⋃(A⋂B)..FILE-1-UNIQUES-ONLY.csv" && echoerr "2.A⋃(A⋂B)..FILE-1-UNIQUES-ONLY.csv written to $TEMP_DIRPATH" |
| 136 | echo "$FILES_JOINED_F2UO" > "$TEMP_DIRPATH"/"3.B⋃(A⋂B)..FILE-2-UNIQUES-ONLY.csv" && echoerr "3.B⋃(A⋂B)..FILE-2-UNIQUES-ONLY.csv written to $TEMP_DIRPATH" |
| 137 | echo "$FILES_JOINED_INTERSECTION" > "$TEMP_DIRPATH"/"4.A⋂B..INTERSECTION.csv" && echoerr "4.A⋂B..INTERSECTION.csv written to $TEMP_DIRPATH" |
| 138 | echo "$FILES_JOINED_DIFFERENCE_AB" > "$TEMP_DIRPATH"/"5.A-B..DIFFERENCE_A_MINUS_B.csv" && echoerr "5.A-B..DIFFERENCE_A_MINUS_B.csv written to $TEMP_DIRPATH" |
| 139 | echo "$FILES_JOINED_DIFFERENCE_BA" > "$TEMP_DIRPATH"/"6.B-A..DIFFERENCE_B_MINUS_A.csv" && echoerr "6.B-A..DIFFERENCE_B_MINUS_A.csv written to $TEMP_DIRPATH" |
| 140 | echo "$FILES_JOINED_DIFFERENCE_SYM" > "$TEMP_DIRPATH"/"7.A⊖B..DIFFERENCE_SYM.csv" && echoerr "7.A⊖B..DIFFERENCE_SYM.csv written to $TEMP_DIRPATH" |
| 141 | |
| 142 | # ==References== |
| 143 | # [1]: How to sort a csv file while retaining the first line. https://stackoverflow.com/a/14562674 |
| 144 | # [2]: How to sort a csv file by sorting on a single field. https://stackoverflow.com/a/44744800 |
| 145 | # [3]: How to join multiple sorted csv files into one. https://stackoverflow.com/a/27606199 |
| 146 | # [4]: How to join a csv file retaining the first line, sorting by certain fields, and not omitting nonmatching fields. https://superuser.com/a/1527095 |
| 147 | # [5]: How to determine symmetric difference between two sorted files. https://www.gnu.org/software/coreutils/manual/html_node/Set-operations.html |
| 148 | # [6]: Set theory symbols https://www.rapidtables.com/math/symbols/Set_Symbols.html |
| 149 | # [7]: How to convert DOS-style to UNIX-style line endings. https://stackoverflow.com/a/2613834 |
| 150 | |
| 151 | # ==Dependencies== |
| 152 | |
| 153 | # GNU bash, version 5.0.3(1)-release (x86_64-pc-linux-gnu) |
| 154 | # Copyright (C) 2019 Free Software Foundation, Inc. |
| 155 | # License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html> |
| 156 | |
| 157 | # This is free software; you are free to change and redistribute it. |
| 158 | # There is NO WARRANTY, to the extent permitted by law. |
| 159 | |
| 160 | |
| 161 | # join (GNU coreutils) 8.30 |
| 162 | # Copyright (C) 2018 Free Software Foundation, Inc. |
| 163 | # License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>. |
| 164 | # This is free software: you are free to change and redistribute it. |
| 165 | # There is NO WARRANTY, to the extent permitted by law. |
| 166 | |
| 167 | # Written by Mike Haertel. |
| 168 | |
| 169 | |
| 170 | # sort (GNU coreutils) 8.30 |
| 171 | # Copyright (C) 2018 Free Software Foundation, Inc. |
| 172 | # License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>. |
| 173 | # This is free software: you are free to change and redistribute it. |
| 174 | # There is NO WARRANTY, to the extent permitted by law. |
| 175 | |
| 176 | # Written by Mike Haertel and Paul Eggert. |
| 177 | |
| 178 | |
| 179 | # head (GNU coreutils) 8.30 |
| 180 | # Copyright (C) 2018 Free Software Foundation, Inc. |
| 181 | # License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>. |
| 182 | # This is free software: you are free to change and redistribute it. |
| 183 | # There is NO WARRANTY, to the extent permitted by law. |
| 184 | |
| 185 | # Written by David MacKenzie and Jim Meyering. |
| 186 | |
| 187 | |
| 188 | # tail (GNU coreutils) 8.30 |
| 189 | # Copyright (C) 2018 Free Software Foundation, Inc. |
| 190 | # License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>. |
| 191 | # This is free software: you are free to change and redistribute it. |
| 192 | # There is NO WARRANTY, to the extent permitted by law. |
| 193 | |
| 194 | # Written by Paul Rubin, David MacKenzie, Ian Lance Taylor, |
| 195 | # and Jim Meyering. |
| 196 | |
| 197 | |
| 198 | # mkdir (GNU coreutils) 8.30 |
| 199 | # Copyright (C) 2018 Free Software Foundation, Inc. |
| 200 | # License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>. |
| 201 | # This is free software: you are free to change and redistribute it. |
| 202 | # There is NO WARRANTY, to the extent permitted by law. |
| 203 | |
| 204 | # Written by David MacKenzie. |
| 205 | |
| 206 | # ==Metadata== |
| 207 | # - Created in task 20200220_90eb0c7d |
| 208 | |