style(unitproc/bkt-bkunzip):Treat bkunzip function as template
[BK-2020-03.git] / unitproc / bkshuf
CommitLineData
22772fe8
SBS
#!/usr/bin/env bash
# Desc: Mixes input lines while also preserving some neighbors
# Usage: cat file | bkshuf [arg1]
57357d43 4# Version 0.1.2
22772fe8
SBS
# Depends: bc 1.07.1, GNU Coreutils 8.32 (shuf)
# Input: arg1 (optional): maximum number of lines to output (default: all input lines)
7
b9e8b771
SBS
# Load env vars. A value already present in the environment (even an empty
# one) is kept; the defaults below apply only when the variable is unset.
## For this number of input lines...
: "${BKSHUF_PARAM_LINEC=1000000}";
## ... target this group size.
: "${BKSHUF_PARAM_GSIZE=25}";
22772fe8
SBS
13
14
# yell: print the script path ($0) and all arguments, as one line, to stderr.
yell() { printf '%s\n' "$0: $*" >&2; };
# die: same as yell, then terminate the shell with exit status 111.
die() { yell "$*"; exit 111; };
# must: run the arguments as a command; if it fails, die reporting the command.
must() { "$@" || die "cannot $*"; };
read_stdin() {
    # Desc: Consumes stdin; outputs it as newline-terminated stdout lines
    # Input: stdin (consumed only when it is a pipe or a regular file, so
    #          that an interactive terminal is never waited on)
    # Output: stdout (newline delimited; nothing at all if no input)
    # Notes: - Leading/trailing whitespace within each line is preserved.
    #        - Trailing empty lines are dropped (command substitution
    #          strips trailing newlines); exactly one final newline is
    #          emitted when there is any content.
    # Example: printf "foo\nbar\n" | read_stdin
    # Depends: GNU bash (version 5.1.16)
    # Version: 0.0.2
    local input_stdin;

    # Store stdin
    if [[ -p /dev/stdin || -f /dev/stdin ]]; then
        input_stdin="$(cat -)";
    fi;

    # Print to stdout; stay silent on empty input rather than emitting
    # a lone blank line.
    if [[ -n $input_stdin ]]; then
        printf '%s\n' "$input_stdin";
    fi;
}; # read stdin to stdout lines
checkInt() {
    # Desc: Checks if arg is a non-negative integer (decimal digits only)
    # Usage: checkInt arg
    # Input: arg: string to test
    # Output: - return code 0 (if arg consists solely of one or more digits)
    #         - return code 1 (otherwise, including empty or signed values)
    #         - calls die (exit 111) unless exactly one argument is given
    # Example: if ! checkInt $arg; then echo "not int"; fi;
    # Version: 0.0.2
    local re_digits='^[0-9]+$'; # kept local so no global is leaked

    #===Process Arg===
    if [[ $# -ne 1 ]]; then
        die "ERROR:Invalid number of arguments:$#";
    fi;

    #===Determine function return code===
    if [[ $1 =~ $re_digits ]]; then
        return 0;
    fi;
    return 1;
} # Checks if arg is integer
consume_line() {
    # Desc: Outputs and destroys one line from list_input, scanning forward
    #       (with wrap-around) from index ip until a non-blank element is found
    # Usage: consume_line;
    # Input: var: list_input  array  input lines (consumed slots are unset)
    #        var: lco         int    line count original (index range is 0..lco-1)
    #        var: lcr         int    line count remaining
    #        var: ip          int    list_input index pointer (scan start)
    # Output: stdout: a single non-blank element from list_input
    #         var: list_input  one element destroyed
    #         var: ip          left at the index of the consumed element
    #         var: lc_out      incremented once
    #         var: lcr         decremented once
    #         return code 0 if a line was consumed; 1 if none was found
    local n line;

    # Nothing to scan; also protects the modulo below from dividing by zero.
    if [[ $lco -lt 1 ]]; then return 1; fi;
    # Start from a valid index.
    if [[ -z $ip || $ip -lt 0 || $ip -ge $lco ]]; then ip=0; fi;

    n=0; # number of indices probed so far
    ### Probe each of the lco indices at most once
    while [[ $n -lt $lco ]]; do
        line="${list_input[$ip]}";
        if [[ -n $line ]]; then
            ##### consume line at index ip
            # Printed synchronously so lines leave in the order consumed.
            printf '%s\n' "$line";
            unset "list_input[$ip]"; # destroy line in list_input array
            lc_out=$(( lc_out + 1 ));
            lcr=$(( lcr - 1 ));
            return 0;
        fi;
        #### advance index pointer, wrapping from lco-1 back to 0
        ip=$(( (ip + 1) % lco ));
        n=$(( n + 1 ));
    done;
    return 1;
}; # consume and output line in list_input array starting at index ip
decide_read() {
    # Desc: Decides whether to read another line in list_input array
    #       by comparing $RANDOM to p_seq_int
    # Usage: if decide_read; then something; fi;
    # Input: var: p_seq_int   probability threshold (int [0 32767])
    # Output: return code 0 if a fresh $RANDOM draw is below p_seq_int;
    #         return code 1 otherwise
    if [[ $RANDOM -lt $p_seq_int ]]; then
        return 0;
    fi;
    return 1;
}; # returns 0 with probability p_seq; 1 otherwise
116
main() {
    # Desc: Reads lines from stdin and writes them to stdout in a mixed
    #       order: it repeatedly jumps to a random input position, then
    #       keeps emitting the following lines with probability p_seq.
    # Usage: main [arg1]
    # Input: arg1 (optional): maximum number of lines to output
    #          (default: all input lines)
    #        stdin: input lines (blank lines are skipped)
    #        var: BKSHUF_PARAM_LINEC  int >= 2  reference input line count
    #        var: BKSHUF_PARAM_GSIZE  int >= 1  group size targeted at that count
    # Output: stdout: mixed lines
    # Depends: bc, shuf; yell(), die(), read_stdin(), checkInt(),
    #          consume_line(), decide_read()
    local par_l0 par_s0 s_exp s line dep;
    local lc lco lcr lc_out lc_out_max ip n_loop1;
    local p_jump p_seq p_seq_int;
    local -a list_input=();

    # Check dependencies
    for dep in bc shuf; do
        if ! command -v "$dep" >/dev/null 2>&1; then
            die "FATAL:Missing dependency:$dep";
        fi;
    done;

    # Check positional arguments
    if [[ $# -gt 0 ]]; then
        if ! checkInt "$1"; then die "FATAL:Not an integer:$1"; fi;
        # Force base 10 so a leading zero (e.g. 010) is not read as octal.
        lc_out_max=$(( 10#$1 )); # output line count
    fi;

    # Check env vars
    if ! checkInt "$BKSHUF_PARAM_LINEC"; then
        die "FATAL:Not an int:BKSHUF_PARAM_LINEC:$BKSHUF_PARAM_LINEC"; fi;
    if ! checkInt "$BKSHUF_PARAM_GSIZE"; then
        die "FATAL:Not an int:BKSHUF_PARAM_GSIZE:$BKSHUF_PARAM_GSIZE"; fi;
    par_l0=$(( 10#$BKSHUF_PARAM_LINEC ));
    par_s0=$(( 10#$BKSHUF_PARAM_GSIZE ));
    # ln(par_l0) is a divisor below, so par_l0 must exceed 1.
    if [[ $par_l0 -lt 2 ]]; then
        die "FATAL:Must be 2 or greater:BKSHUF_PARAM_LINEC:$BKSHUF_PARAM_LINEC"; fi;
    # A group size below 1 would make the jump probability invalid.
    if [[ $par_s0 -lt 1 ]]; then
        die "FATAL:Must be 1 or greater:BKSHUF_PARAM_GSIZE:$BKSHUF_PARAM_GSIZE"; fi;

    # store input lines from stdin (like `shuf`)
    while IFS= read -r line; do
        # skip lines that are empty or only spaces/tabs
        if [[ -z ${line//[$' \t']/} ]]; then continue; fi;
        #yell "DEBUG:INPUT:$line";
        list_input+=("$line");
    done < <( read_stdin; );

    # calc line count (lc)
    lc="${#list_input[@]}";
    #yell "DEBUG:lc:$lc";
    #yell "DEBUG:list_input:$(declare -p list_input)";

    # Nothing to mix: stop before taking ln(0) or asking shuf for range 0..-1.
    if [[ $lc -eq 0 ]]; then return 0; fi;

    # calculate group size s
    # s = (par_s0 - 1) / ln(par_l0)^2 * ln(lc)^2 + 1
    # (s equals par_s0 when lc equals par_l0, and 1 when lc is 1)
    s_exp="(( $par_s0 - 1 )/( ( l( $par_l0 ) )^2 ))*(l( $lc ))^2+1";
    s="$(bc -l <<< "scale=12; $s_exp")";

    # calculate probabilities p_jump, p_seq
    p_jump="$(bc -l <<< "scale=12; 1 / ( $s )")";
    p_seq="$(bc -l <<< "scale=12; 1 - $p_jump")";
    p_seq_int="$(bc -l <<< "scale=0; ($p_seq * 32767)/1")"; # p_seq as int [0 32767] for $RANDOM range
    if ! checkInt "$p_seq_int"; then
        die "FATAL:Could not calculate p_seq_int:$p_seq_int"; fi;

    # generate output
    lco="$lc"; # save original input line count
    lcr="$lco";
    lc_out="0"; # init output line counter
    if [[ -z "$lc_out_max" ]]; then lc_out_max="$lco"; fi;
    RANDOM="$(shuf -i0-32767 -n1)"; # init Bash PRNG
    n_loop1="0";
    while [[ $lcr -ge 1 ]] && [[ $lc_out -lt $lc_out_max ]]; do
        ## Select random unconsumed input line and consume it to output
        ip="$(shuf -i0-$(( lco - 1 )) -n1)" || die "FATAL:shuf failed"; # jump target
        consume_line;

        ## Consume the next sequential line with probability p_seq.
        while decide_read && [[ $lcr -ge 1 ]] && [[ $lc_out -lt $lc_out_max ]]; do
            consume_line;
        done;

        n_loop1=$(( n_loop1 + 1 )); # increment jump counter
    done;
    #yell "DEBUG:n_loop1:$n_loop1"; # count jumps

}; # main program
182
# Entry point: forward all command-line arguments to main.
main "$@";
184
185# Author: Steven Baltakatei Sandoval
186# License: GPLv3+
187
188# Dependency information
189
190# bc 1.07.1
191# Copyright 1991-1994, 1997, 1998, 2000, 2004, 2006, 2008, 2012-2017 Free Software Foundation, Inc.
192
193
194# shuf (GNU coreutils) 8.32
195# Copyright (C) 2020 Free Software Foundation, Inc.
196# License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>.
197# This is free software: you are free to change and redistribute it.
198# There is NO WARRANTY, to the extent permitted by law.
199
200# Written by Paul Eggert.
201
202