+save_sample() {
+  # Desc: Clear the sample directory, then copy a random sample of the
+  #   listed files into it and record each copied file in paths.txt.
+  # Usage: save_sample arg1
+  # Input: arg1    list_paths          newline-delimited list of file paths to sample from
+  #        envvar  BKFEH_SAMPLE_DIR    sample directory; if unset, nothing is done
+  #        envvar  BKFEH_SAMPLE_SIZE   (optional) byte limit for sample dir (default 10000000)
+  #        envvar  BKFEH_SAMPLE_COUNT  (optional) max number of files to sample (default 100)
+  # Output: files   "$BKFEH_SAMPLE_DIR"/<n>_<hash>..<name>[.<ext>]
+  #         file    "$BKFEH_SAMPLE_DIR"/paths.txt (lines of: n_samp,file_hash,file_path)
+  #         status  0 if BKFEH_SAMPLE_DIR is unset or sampling ran;
+  #                 1 if no paths were given or the sample dir does not exist
+  # Depends: bash 4.2+ ([[ -v ]]), GNU find, GNU Coreutils (b2sum, basename, cp, cut, du, head, wc)
+  #          BK-2020-03: bkshuf (0.0.1), yell(), must(), checkInt()
+  # Note: GNU Parallel is not called directly by this function.
+  #   NOTE(review): it may still be required by bkshuf - confirm.
+  local list_paths list_paths_sample line;
+  local sample_count sample_max_space sample_log count_prev_samples;
+  local sample_act_space cand_space sample_req_space;
+  local n_samp n_samp_w n_samp_dd;
+  local file_path file_name file_stem file_ext file_hash file_shortname;
+  local file_name_new file_path_new;
+  sample_count="100"; # max number of images to put in sample dir
+  sample_max_space="10000000"; # max bytes to put in sample dir
+
+  # Load environment variables if set
+  if [[ ! -v BKFEH_SAMPLE_DIR ]]; then return 0; fi; # return early if environment var not set.
+  # checkInt is defined elsewhere; presumably succeeds only for integer strings.
+  if [[ -v BKFEH_SAMPLE_SIZE ]] && checkInt "$BKFEH_SAMPLE_SIZE"; then
+    sample_max_space="$BKFEH_SAMPLE_SIZE";
+  fi;
+  if [[ -v BKFEH_SAMPLE_COUNT ]] && checkInt "$BKFEH_SAMPLE_COUNT"; then
+    sample_count="$BKFEH_SAMPLE_COUNT";
+  fi;
+
+  # Validate inputs BEFORE touching the sample dir, so that a bad call
+  # does not wipe the previous sample.
+  if [[ -z "${1:-}" ]]; then
+    yell "ERROR:NO paths available to sample.";
+    return 1;
+  fi;
+  list_paths="$1"; # newline-delimited list of file paths to sample from
+  if [[ ! -d "$BKFEH_SAMPLE_DIR" ]]; then
+    yell "ERROR:Does not exist: $BKFEH_SAMPLE_DIR";
+    return 1;
+  fi;
+  yell "STATUS:Environment variable BKFEH_SAMPLE_DIR set. Clearing and saving samples...";
+
+  ## clear previous sample
+  ## (removes every regular file directly in the dir, including the old paths.txt)
+  ## One '.' is printed per file so names containing newlines are counted once.
+  count_prev_samples="$(find "$BKFEH_SAMPLE_DIR" -maxdepth 1 -type f -printf '.' | wc -c)";
+  yell "STATUS:Deleting $count_prev_samples previous samples...";
+  find "$BKFEH_SAMPLE_DIR" -maxdepth 1 -type f -exec rm -- '{}' + ;
+
+  ## save random sample
+  yell "STATUS:Saving random sample of size $sample_count to $BKFEH_SAMPLE_DIR...";
+  list_paths_sample="$(printf '%s\n' "$list_paths" | bkshuf "$sample_count" | head -n "$sample_count")";
+  n_samp=0; # init sample file counter (counts candidates, copied or not)
+  n_samp_w="${#sample_count}"; # zero-pad width = number of digits in sample_count
+  sample_log="$BKFEH_SAMPLE_DIR"/paths.txt;
+  printf "%s,%s,%s\n" "n_samp" "file_hash" "file_path" >> "$sample_log";
+  while IFS= read -r line; do
+    if [[ -z "$line" ]]; then continue; fi;
+    if [[ ! -f "$line" ]]; then
+      # The file may have vanished since the list was built; skip it.
+      yell "WARNING:Skipping, not a regular file: $line";
+      continue;
+    fi;
+
+    ### check size limit
+    ### -s yields exactly one total line even if the sample dir has subdirs.
+    sample_act_space="$(du -sb -- "$BKFEH_SAMPLE_DIR" | cut -f1)"; # actual used space
+    cand_space="$(du -sb -- "$line" | cut -f1)"; # size of candidate file to add
+    sample_req_space="$((sample_act_space + cand_space))";
+
+    if [[ "$sample_req_space" -lt "$sample_max_space" ]]; then
+      ### Customize file names
+      printf -v n_samp_dd '%0*d' "$n_samp_w" "$n_samp"; # sample number fixed-width
+      file_path="$line";
+      file_name="$(basename -- "$line")";
+      #### use file hash to avoid clobbering; hashing stdin keeps b2sum from
+      #### escaping or misparsing unusual file names.
+      file_hash="$(b2sum -l32 < "$line")";
+      file_hash="${file_hash%% *}";
+      #### split off the extension only if the name really has one
+      if [[ "$file_name" == ?*.?* ]]; then
+        file_ext=".${file_name##*.}";
+        file_stem="${file_name%.*}";
+      else
+        file_ext="";
+        file_stem="$file_name";
+      fi;
+      file_shortname="${file_stem:0:32}";
+      file_name_new="${n_samp_dd}_${file_hash}..${file_shortname}${file_ext}";
+      file_path_new="$BKFEH_SAMPLE_DIR"/"$file_name_new";
+
+      #### add file to sample dir
+      must cp -n -- "$file_path" "$file_path_new";
+      #### note path in sample dir log
+      printf "%s,%s,%s\n" "$n_samp_dd" "$file_hash" "$file_path" \
+        >> "$sample_log";
+    fi;
+    # Plain assignment: ((n_samp++)) returns status 1 when n_samp is 0,
+    # which would abort a caller running under `set -e`.
+    n_samp=$((n_samp + 1));
+  done <<< "$list_paths_sample";
+}; # save sample of files
+