<doc-data|<doc-title|bkshuf: A Shuf-Like Utility with Pre-Image Resistance
and Relative Order Preservation for Random Sampling of Long
Lists>|<doc-author|<author-data|<author-name|Steven Baltakatei
- Sandoval>>>|<doc-date|2023-02-14T13:48+00>|<doc-misc|CC BY-SA 4.0>>
+ Sandoval>>>|<doc-date|2023-02-14T13:56+00>|<doc-misc|CC BY-SA 4.0>>
<section|Summary>
size trends towards <math|s>.
<\eqnarray*>
- <tformat|<table|<row|<cell|p<rsub|<text|seq>>>|<cell|=>|<cell|<around*|(|1-p<rsub|<text|jump>>|)>>>|<row|<cell|p<rsub|<text|jump>>>|<cell|=>|<cell|1-p<rsub|<text|seq>>>>|<row|<cell|s>|<cell|=>|<cell|<frac|1|p<rsub|<text|jump>>>=<frac|1|1-p<rsub|<text|seq>>><eq-number>>>|<row|<cell|s>|<cell|=>|<cell|<frac|1|1-p<rsub|<text|seq>>>>>|<row|<cell|1-p<rsub|<text|seq>>>|<cell|=>|<cell|<frac|1|s>>>|<row|<cell|p<rsub|<text|seq>>-1>|<cell|=>|<cell|<frac|-1|s>>>|<row|<cell|p<rsub|<text|seq>>>|<cell|=>|<cell|1-<frac|1|s<around*|(|l<rsub|<text|in>>|)>><eq-number><inactive|<label|eq
- pseq-from-s-lin>>>>|<row|<cell|p<rsub|<text|jump>>>|<cell|=>|<cell|<frac|1|s<around*|(|l<rsub|<text|in>>|)>><eq-number><inactive|<label|eq
- pjump-from-s-lin>>>>|<row|<cell|>|<cell|>|<cell|>>|<row|<cell|p<rsub|<text|seq>><around*|(|l<rsub|<text|in>>|)>>|<cell|=>|<cell|1-<around*|[|<around*|(|<frac|s<around*|(|l<rsub|<text|in>,0>|)>-1|<around*|[|ln
- <around*|(|l<rsub|<text|in>,0>|)>|]><rsup|2>>|)>\<cdot\><around*|[|<text|ln><around*|(|l<rsub|<text|in>>|)>|]><rsup|2>+1|]><rsup|-1><eq-number><inactive|<label|eq
- pseq-from-s-lin-exp>>>>>>
+ <tformat|<table|<row|<cell|p<rsub|<text|seq>>>|<cell|=>|<cell|<around*|(|1-p<rsub|<text|jump>>|)>>>|<row|<cell|p<rsub|<text|jump>>>|<cell|=>|<cell|1-p<rsub|<text|seq>>>>|<row|<cell|s>|<cell|=>|<cell|<frac|1|p<rsub|<text|jump>>>=<frac|1|1-p<rsub|<text|seq>>><eq-number>>>|<row|<cell|s>|<cell|=>|<cell|<frac|1|1-p<rsub|<text|seq>>>>>|<row|<cell|1-p<rsub|<text|seq>>>|<cell|=>|<cell|<frac|1|s>>>|<row|<cell|p<rsub|<text|seq>>-1>|<cell|=>|<cell|<frac|-1|s>>>|<row|<cell|p<rsub|<text|seq>>>|<cell|=>|<cell|1-<frac|1|s<around*|(|l<rsub|<text|in>>|)>><eq-number><label|eq
+ pseq-from-s-lin>>>|<row|<cell|p<rsub|<text|jump>>>|<cell|=>|<cell|<frac|1|s<around*|(|l<rsub|<text|in>>|)>><eq-number><label|eq
+ pjump-from-s-lin>>>|<row|<cell|>|<cell|>|<cell|>>|<row|<cell|p<rsub|<text|seq>><around*|(|l<rsub|<text|in>>|)>>|<cell|=>|<cell|1-<around*|[|<around*|(|<frac|s<around*|(|l<rsub|<text|in>,0>|)>-1|<around*|[|ln
+ <around*|(|l<rsub|<text|in>,0>|)>|]><rsup|2>>|)>\<cdot\><around*|[|<text|ln><around*|(|l<rsub|<text|in>>|)>|]><rsup|2>+1|]><rsup|-1><eq-number><label|eq
+ pseq-from-s-lin-exp>>>>>
</eqnarray*>
<subsubsection|Jump from random variate of inverse gaussian distribution>
<\references>
<\collection>
<associate|auto-1|<tuple|1|1|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
- <associate|auto-10|<tuple|3.3.3|4|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
+ <associate|auto-10|<tuple|3.3.3|5|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
<associate|auto-11|<tuple|3.3.4|5|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
<associate|auto-12|<tuple|4|5|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
<associate|auto-13|<tuple|1|5|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
<associate|auto-14|<tuple|1|?|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
<associate|auto-2|<tuple|2|1|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
- <associate|auto-3|<tuple|3|1|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
- <associate|auto-4|<tuple|3.1|1|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
+ <associate|auto-3|<tuple|3|2|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
+ <associate|auto-4|<tuple|3.1|2|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
<associate|auto-5|<tuple|3.2|2|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
<associate|auto-6|<tuple|3.3|2|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
<associate|auto-7|<tuple|3.3.1|2|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
<associate|eq gsize-lin|<tuple|6|4|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
<associate|eq gsize-model|<tuple|3|3|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
<associate|eq gsize-param-rel|<tuple|4|3|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
+ <associate|eq pjump-from-s-lin|<tuple|11|?|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
+ <associate|eq pseq-from-s-lin|<tuple|10|?|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
+ <associate|eq pseq-from-s-lin-exp|<tuple|12|?|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
<associate|eq rel-x-lin|<tuple|1|3|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
<associate|eq rel-x0-lin0|<tuple|2|3|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
<associate|fig ex-plot-s|<tuple|1|3|../../../../../wr/20230213..bkshuf_draft/src/doc.tm>>
#!/usr/bin/env bash
# Desc: Copies random audio files
-# Usage: bk-copy-rand-music.sh [dir SOURCE] [dir DEST] [int DURATION]
-# Version: 0.0.3
+# Usage: bk-copy-rand-music [dir SOURCE] [dir DEST] [int DURATION] ([int BYTES])
+# Version: 0.1.0
+# Depends: BK-2020-03: bkshuf v0.1.0
declare -Ag appRollCall # Associative array for storing app status
declare -Ag fileRollCall # Associative array for storing file status
# Adjustable parameters
music_codecs=("vorbis" "aac" "mp3" "flac" "opus"); # whitelist of valid codec_names ffprobe might return
-max_loops="1000000"; # max number of files to test whether are audio or not
max_filename_length="255"; # max output filename length
min_file_duration="10"; # minimum duration per music file
+max_file_duration="3600"; # maximum duration per music file
+min_file_size="100000"; # minimum size per music file (bytes)
+max_file_size="100000000"; # maximum size per music file (bytes)
+siz_dest="600000000"; # default destination size limit: 600 MB
+max_find_depth="10"; # max find depth
+
yell() { echo "$0: $*" >&2; } # print script path and all args to stderr
die() { yell "$*"; exit 111; } # same as yell() but non-zero exit status
-try() { "$@" || die "cannot $*"; } # runs args as command, reports args if command fails
+must() { "$@" || die "cannot $*"; } # runs args as command, reports args if command fails
checkapp() {
# Desc: If arg is a command, save result in assoc array 'appRollCall'
# Usage: checkapp arg1 arg2 arg3 ...
audio tracks from SOURCE to DEST.
USAGE:
- bk-copy-rand-music [dir SOURCE] [dir DEST] [int DURATION]
+ bk-copy-rand-music [dir SOURCE] [dir DEST] [int DURATION] (int BYTES)
EXAMPLE:
bk-copy-rand-music ~/Music /tmp/music-sample 3600
+ bk-copy-rand-music ~/Music /tmp/music-sample 3600 680000000
DEPENDENCIES:
ffprobe
# Input: arg1: path to source tree
# arg2: path to destination tree
# arg3: cumulative duration (seconds) of audio files in destination tree
+ # arg4: cumulative size (bytes) of audio files in destination tree (optional)
# assoc arrays: appRollCall, fileRollCall, dirRollCall
+ # env.var: BKSHUF_PARAM_LINEC
+ # BKSHUF_PARAM_GSIZE
+ # arrays: music_codecs
+ # vars: max_filename_length, min_file_duration, max_file_duration,
+ # min_file_size, max_file_size, siz_dest, max_find_depth
# Output: [none]
- # Depends: yell(), checkdir() 0.1.2, displayMissing() 1.0.0, GNU Coreutils 8.30 (shuf)
- local arg1 arg2 arg3 dur_dest dir_source dir_dest list_all
+ # Depends: yell(), checkdir() 0.1.2, displayMissing() 1.0.0, GNU Coreutils 8.30
+ # BK-2020-03: bkshuf v0.1.0
+ local arg1 arg2 arg3 dur_dest dir_source dir_dest
declare -a list_files # array for files to be considered
- declare -A list_copy # assoc array for files to be copied (key=path; value=duration)
+ declare -a list_copy_sa # simple array for files to be copied (string: "$dur,$siz,$path")
# Parse args
arg1="$1";
arg2="$2";
arg3="$3";
- if [[ $# -ne 3 ]]; then showUsage; die "ERROR:Invalid number of args."; fi;
+ arg4="$4";
+ if ! ([[ $# -eq 3 ]] || [[ $# -eq 4 ]]); then showUsage; die "ERROR:Invalid number of args:$#"; fi;
## Check duration
if checkInt "$arg3"; then
else
yell "ERROR:Duration (seconds) not an int:$arg3"
fi;
+
+ ## Check size
+ if [[ -n "$arg4" ]]; then
+ if checkInt "$arg4"; then
+ siz_dest="$arg4";
+ else
+ yell "ERROR:Size (bytes) not an int:$arg4";
+ fi;
+ fi;
## Check directories
if checkdir "$arg1" "$arg2"; then
fi;
## Check apps
- checkapp ffprobe;
+ checkapp ffprobe bkshuf;
if ! displayMissing; then
showUsage;
fi;
yell "STATUS:Working...";
-
- # Generate file path list
- list_all="$(find -L "$dir_source")";
- #yell "DEBUG:list_files_rel:$list_files_rel";
- # Prune list_all of non-files and save as array list_files
+ # Populate list_files array
while read -r line; do
- #yell "DEBUG:line:$line";
- if ! [[ -f $line ]]; then
- #yell "DEBUG:Not a file:$line";
- #yell ""; # debug
- continue;
- fi;
- list_files+=("$line");
- done < <(echo "$list_all");
+ list_files+=("$line");
+ done < <(find -L "$dir_source" -maxdepth "$max_find_depth" -type f | sort);
- # Randomly test and add elements of list_files array to list_copy
+ # Test and add random elements of list_files to list_copy
dur=0; # Initialize duration
+ siz=0; # Initialize size
n=0; # Initialize loop counter
+ dur_cand_w=1; # Init duration digit width counter
+ siz_cand_w=1; # Init size digit width counter
## Get element count of list_files array
- list_files_count="${#list_files[@]}";
- while [[ $dur -le $dur_dest ]]; do
- #yell "DEBUG:list_copy building loop:$n";
- ### Select random element of list_files array
- list_files_index="$(shuf -i 1-"$list_files_count" -n1)";
- list_files_index="$((list_files_index - 1))"; # bash arrays are zero-indexed
- path_candfile="${list_files[$list_files_index]}"; # path of candidate file
+ file_count="${#list_files[@]}";
+ while read -r line && \
+ [[ $dur -le $dur_dest ]] && \
+ [[ $siz -le $siz_dest ]] && \
+ [[ $n -le $file_count ]]; do
+ #yell "DEBUG:list_copy building loop:$n";
+ path_candfile="$line"; # path of candidate file
### Check if has valid codec
if ! check_parsable_audio_ffprobe "$path_candfile"; then continue; fi; # reject
### Check and save duration
dur_cand="$(get_media_length "$path_candfile")";
dur_cand="${dur_cand%%.*}"; # convert float to int
+ if [[ "$((dur + dur_cand))" -gt "$dur_dest" ]]; then continue; fi; # reject
+ dur_cand_wnow="$(printf "%s" "$dur_cand" | wc -m)"; # duration width count
+ if [[ $dur_cand_wnow -gt $dur_cand_w ]]; then
+ dur_cand_w="$dur_cand_wnow"; fi;
if ! checkInt "$dur_cand"; then continue; fi; # reject
if [[ "$dur_cand" -lt "$min_file_duration" ]]; then continue; fi; # reject
-
- ### Add/update candfile to list_copy assoc. array (key=path; value=duration)
+ if [[ "$dur_cand" -gt "$max_file_duration" ]]; then continue; fi; # reject
+
+ ### Check and save size
+ siz_cand="$(du -b "$path_candfile" | awk '{ print $1 }')"; # size in bytes
+ siz_cand_wnow="$(printf "%s" "$siz_cand" | wc -m)"; # size width count
+ if [[ $siz_cand_wnow -gt $siz_cand_w ]]; then
+ siz_cand_w="$siz_cand_wnow"; fi;
+ if ! checkInt "$siz_cand"; then continue; fi; # reject
+ if [[ "$siz_cand" -lt "$min_file_size" ]]; then continue; fi; # reject
+ if [[ "$siz_cand" -gt "$max_file_size" ]]; then continue; fi; # reject
+
+ ### Add/update candfile to array:
+ ### list_copy_sa (simple array of "duration,size,path" strings)
#yell "DEBUG:Adding $path_candfile";
- list_copy["$path_candfile"]="$dur_cand";
+ list_copy_sa+=("$dur_cand,$siz_cand,$path_candfile"); # for copying with order
- ### Update total duration $dur by summing all list_copy assoc. array values
- dur=0;
- for value in "${list_copy[@]}"; do
- dur="$((dur + value))";
- done;
+ ### Update total duration $dur and total size $siz
+ dur="$((dur + dur_cand))";
+ siz="$((siz + siz_cand))";
#yell "DEBUG:dur:$dur";
+ #yell "DEBUG:siz:$siz";
- ### Sanity check
((n++));
- if [[ $n -gt $max_loops ]]; then die "ERROR:Too many loops:$n"; fi;
- done;
+ done < <(printf "%s\n" "${list_files[@]}" | bkshuf);
n=0; # Initialize loop counter
+ num_w="$(printf "%s" "${#list_copy_sa[@]}" | wc -m)"; # init file number format
+ num_fmt="%0""$num_w""d";
+ path_log_output="$dir_dest"/COPY.log;
+ printf "num,fingerprint,duration,size,original_path\n" >> "$path_log_output";
# Copy files in list_copy to dir_dest;
- for key in "${!list_copy[@]}"; do
- value="${list_copy[$key]}";
- ## Get basename of path
- file_basename="$(basename "$key")";
-
- ## Get 16-character b2sum fingerprint (for different files that share basename)
- fingerprint="$(b2sum -l64 "$key" | cut -d' ' -f1)";
-
- ## Form output filename
- file_name="$fingerprint".."$file_basename";
+ while read -r line; do
+ yell "DEBUG:line:$line"; # debug
+ fdur="$(printf "%s" "$line" | cut -d',' -f1)";
+ fsize="$(printf "%s" "$line" | cut -d',' -f2)";
+ fpath="$(printf "%s" "$line" | cut -d',' -f3-)";
+ ## Get basename of path
+ file_basename="$(basename "$fpath")";
+
+ ## Get 8-character (32-bit) b2sum fingerprint (for different files that share basename)
+ fingerprint="$(b2sum -l32 "$fpath" | awk '{print $1}' )";
+
+ ## Form output filename
+ num="$(printf "$num_fmt" "$n")";
+ file_name="$num"_"$fingerprint".."$file_basename";
file_name="${file_name:0:$max_filename_length}"; # Limit filename length (e.g. Windows has max of 255 characters)
## Form output path
path_output="$dir_dest"/"$file_name";
## Copy
- try cp "$key" "$path_output" && yell "NOTICE:Copied ($value seconds): $key ";
+ must cp "$fpath" "$path_output" && yell "NOTICE:Copied ($fdur seconds): $fpath ";
#yell "DEBUG:Copied $file_basename to $dur_dest.";
## Append log
- path_log_output="$dir_dest"/COPY.log;
- if [[ $n -le 0 ]]; then
- echo "fingerprint","duration","original_path" >> "$path_log_output";
- else
- echo "$fingerprint","$value","$key" >> "$path_log_output";
- fi;
+ fpath_can="$(readlink -f "$fpath")"; # resolve symlinks to canonical path
+ log_fmt="%s,%s,%""$dur_cand_w""d,%""$siz_cand_w""d,%s\n"; # e.g. "%s,%s,%3d,%5d,%s\n" if dur_cand_w=3 and siz_cand_w=5
+ #yell "DEBUG:log_fmt:$log_fmt"; sleep 10; # debug
+ printf "$log_fmt" "$num" "$fingerprint" "$fdur" "$fsize" "$fpath_can" >> "$path_log_output";
((n++));
unset file_basename path_output
- done;
+ done < <(printf "%s\n" "${list_copy_sa[@]}");
- # Report total duration
+ # Report total duration and size
yell "NOTICE:Total duration (seconds):$dur";
+ yell "NOTICE:Total size (bytes):$siz";
} # Main program