Merge branch 'develop' of https://zdv2.bktei.com/gitweb/BK-2020-03 into develop
[BK-2020-03.git] / unitproc / bkfind
... / ...
CommitLineData
#!/bin/bash

# Date: 2020-01-20T17:08Z
#
# Author: Steven Baltakatei Sandoval (baltakatei.com)
#
# License: This bash script, `bkfind`, is licensed under GPLv3 or
# later by Steven Baltakatei Sandoval:
#
#    `bkfind`, a duplicate file finder
#    Copyright (C) 2020  Steven Baltakatei Sandoval (baltakatei.com)
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    A copy of the GNU General Public License may be found at
#    <https://www.gnu.org/licenses/>.
#
# Description: This script searches a specified directory tree for
# files whose names contain the name of a specified file, hashes every
# match, and reports which matches have identical contents. It works
# as follows:
#
#   - Strip any leading path from the specified file, keeping only its
#     base name.
#
#   - Search the specified directory tree for regular files whose
#     names contain that base name (case-insensitive), and hash each
#     one with `rhash`.
#
#   - List groups of files sharing the same hash first (groups are
#     separated by a blank line), then list files with unique hashes.
#
#   Every output line consists of the last $HASH_DISP_LENGTH characters
#   of the hash followed by the remainder of the `rhash` output line
#   (which carries the file path).
#
# Usage: bkfind [DIRECTORY [FILE]]
#
#   DIRECTORY  Root of the tree to search. Defaults to the current
#              directory when omitted or empty.
#   FILE       File whose base name is searched for. When omitted or
#              empty, every regular file under DIRECTORY matches.
#
# Dependencies: find, rhash, sort, uniq, cut, basename, bash. See end
# of file.
#
# Tested on:
#
#   - GNU/Linux Debian 10
#


#==Initialization==
# Use input arguments to define internal script variables.
DIR1="${1:-.}"  # Specified directory. `find` searches the current
                # directory when given no starting point; the explicit
                # default keeps that behavior now that the expansion
                # below is quoted.
FILE1="${2:-}"  # Specified file
RHASH_HASH_TYPE="sha512"
HASH_DISP_LENGTH=16  # Number of trailing hash characters shown in the output
# The number of characters returned by the chosen hash function
# (ex: `rhash --sha512 {}` produces 512/4=128 hexadecimal chars).
HASH_CHAR_LENGTH=$(( 512 / 4 ))

# `find` parses a starting point that begins with `-` as part of its
# expression, so anchor such a directory name to the current directory.
case "$DIR1" in
    -*) DIR1="./$DIR1" ;;
esac

# Strip path information from provided file name. The `--` keeps a
# file name that begins with `-` from being parsed as an option.
FILEBASE1="$(basename -- "$FILE1")"


#==Main Program==
# Generate list of sha512 hashes and filepaths, save to $HASHLIST1.
# "$DIR1" is quoted so that directories containing whitespace or glob
# characters are passed to `find` intact. `{} +` hands many files to
# each `rhash` invocation instead of spawning one process per file.
# NOTE: `-iname` takes a shell pattern, so glob metacharacters present
# in $FILEBASE1 are interpreted by `find` rather than matched literally.
HASHLIST1="$(find "$DIR1" -type f -iname "*$FILEBASE1*" -exec rhash --"$RHASH_HASH_TYPE" {} +)"

# Specify character position before which characters are dropped from
# each line with `cut`.
CUT_POSITION=$(( 1 + HASH_CHAR_LENGTH - HASH_DISP_LENGTH ))

# `printf '%s\n'` is used instead of `echo -e` throughout so that
# backslash sequences occurring in file paths are passed through
# verbatim instead of being expanded.

#====Files with duplicate hashes====
# Generate sublist of duplicate entries, save to $DUPLICATES1. Sorting
# places identical hashes on adjacent lines, which `uniq` requires; only
# the hash (the first $HASH_CHAR_LENGTH characters) is compared.
DUPLICATES1="$(printf '%s\n' "$HASHLIST1" \
    | sort \
    | uniq -D --check-chars="$HASH_CHAR_LENGTH")"

# Format $DUPLICATES1 for readability by grouping, truncating sha512
# hash; save to $DUPLICATES2.
DUPLICATES2="$(printf '%s\n' "$DUPLICATES1" \
    | uniq --check-chars="$HASH_CHAR_LENGTH" --group \
    | cut --characters="$CUT_POSITION"-)"

#====Files with unique hashes====
# Generate sublist of unique entries, save to $UNIQUES1.
UNIQUES1="$(printf '%s\n' "$HASHLIST1" \
    | sort \
    | uniq --unique --check-chars="$HASH_CHAR_LENGTH")"

# Format $UNIQUES1 for readability by truncating sha512 hash; save to
# $UNIQUES2.
UNIQUES2="$(printf '%s\n' "$UNIQUES1" | cut --characters="$CUT_POSITION"-)"


# List results: duplicate groups first, then files with unique hashes.
printf '%s\n' "$DUPLICATES2"
printf '%s\n' "$UNIQUES2"

# Dependencies:
#
#   - find (GNU findutils) 4.6.0.225-235f
#     Copyright (C) 2019 Free Software Foundation, Inc.
#     License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>.
#     Written by Eric B. Decker, James Youngman, and Kevin Dalley.
#
#   - RHash v1.3.8
#     License: RHash License <http://rhash.sourceforge.net/license.php>
#
#   - sort (GNU coreutils) 8.30
#     Copyright (C) 2018 Free Software Foundation, Inc.
#     License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>.
#     This is free software: you are free to change and redistribute it.
#     There is NO WARRANTY, to the extent permitted by law.
#     Written by Mike Haertel and Paul Eggert.
#
#   - uniq (GNU coreutils) 8.30
#     Copyright (C) 2018 Free Software Foundation, Inc.
#     License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>.
#     This is free software: you are free to change and redistribute it.
#     There is NO WARRANTY, to the extent permitted by law.
#     Written by Richard M. Stallman and David MacKenzie.
#
#   - cut (GNU coreutils) 8.30
#     Copyright (C) 2018 Free Software Foundation, Inc.
#     License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>.
#     This is free software: you are free to change and redistribute it.
#     There is NO WARRANTY, to the extent permitted by law.
#     Written by David M. Ihnat, David MacKenzie, and Jim Meyering.
#
#   - GNU bash, version 5.0.3(1)-release (x86_64-pc-linux-gnu)
#     Copyright (C) 2019 Free Software Foundation, Inc.
#     License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
#     This is free software; you are free to change and redistribute it.
#     There is NO WARRANTY, to the extent permitted by law.