feat(unitproc/bkshuf):Add bash script like shuf
[BK-2020-03.git] / unitproc / bkshuf
CommitLineData
22772fe8
SBS
#!/usr/bin/env bash
# Desc: Mixes input lines while also preserving some neighbors
# Usage: cat file | bkshuf [arg1]
# Version 0.0.1
# Depends: bc 1.07.1, GNU Coreutils 8.32 (shuf)
# Input: stdin: lines to mix
#        arg1: (optional) maximum number of lines to output

# Tuning parameters read by main() when it calculates the group size:
# the calculated group size equals BKSHUF_PARAM_GSIZE when the input has
# exactly BKSHUF_PARAM_LINEC lines.
BKSHUF_PARAM_LINEC=1000000 # reference input line count
BKSHUF_PARAM_GSIZE=25      # lines per group for BKSHUF_PARAM_LINEC lines of input


# Print the script path ($0) and all args to stderr.
yell() { printf '%s: %s\n' "$0" "$*" >&2; }
# Same as yell(), then exit the shell with status 111.
die() { yell "$*"; exit 111; }
# Run args as a command; if it fails, die() reporting the command.
must() { "$@" || die "cannot $*"; }
read_stdin() {
  # Desc: Consumes stdin; outputs as stdout lines
  # Input: stdin (consumes)
  # Output: stdout (newline delimited)
  # Example: printf "foo\nbar\n" | read_stdin
  # Depends: GNU bash (version 5.1.16)
  # Version: 0.0.2
  # Notes: - All of stdin is buffered before anything is printed.
  #        - Line content is passed through unchanged (leading/trailing
  #          whitespace is kept); a final line lacking a newline gets one.
  #        - Nothing is printed when stdin is empty or is a terminal.
  local -a lines=()

  # Read unless stdin is a terminal. Testing "not a tty" rather than
  # "is a pipe" means file redirection (read_stdin < file) is consumed too,
  # while an interactive call still returns immediately instead of blocking.
  if [[ ! -t 0 ]]; then
    mapfile -t lines
  fi

  # printf with zero arguments would still emit one empty line, so guard it.
  if (( ${#lines[@]} > 0 )); then
    printf '%s\n' "${lines[@]}"
  fi
} # read stdin to stdout lines
checkInt() {
  # Desc: Checks if arg is a non-negative integer (decimal digits only)
  # Usage: checkInt arg
  # Input: arg: string to test
  # Output: - return code 0 (if arg consists solely of digits 0-9)
  #         - return code 1 (otherwise, including empty, signed or
  #           fractional values)
  #         - calls die() if not given exactly one argument
  # Example: if ! checkInt $arg; then echo "not int"; fi;
  # Version: 0.0.2
  local re_uint='^[0-9]+$' # kept local so no global is left behind

  #===Process Arg===
  if [[ $# -ne 1 ]]; then
    die "ERROR:Invalid number of arguments:$#"
  fi

  #===Determine function return code===
  # The status of the regex match is the function's return code.
  [[ $1 =~ $re_uint ]]
} # Checks if arg is integer
consume_line() {
  # Desc: Outputs and destroys line from list_input starting at index ip
  # Usage: consume_line;
  # Input: var: list_input  array  input lines
  #        var: lco         int    line count original
  #        var: lcr         int    line count remaining
  #        var: ip          int    list_input index pointer
  # Output: stdout: a single non-blank element from list_input
  #         var: list_input  one element destroyed
  #         var: ip          left at the index of the consumed element
  #         var: lc_out      incremented once
  #         var: lcr         decremented once
  #         Nothing is printed and no counter changes if no non-blank
  #         element is found in one full pass over indices [0, lco).
  local n line

  n=0 # number of indices inspected so far
  ### Loop at most once through all lco indices until a non-blank line found
  while (( n < lco )); do
    #### check if line at ip is blank (unset elements read as empty)
    line="${list_input[ip]-}"
    if [[ -n $line ]]; then
      ##### consume line at index ip
      # Printed in the foreground so output order matches consumption order.
      printf '%s\n' "$line"
      unset "list_input[$ip]" # destroy line in list_input array
      lc_out=$(( lc_out + 1 ))
      lcr=$(( lcr - 1 ))
      #yell "DEBUG:Consumed line ip:$ip:$line";
      break
    fi
    #### advance index pointer, wrapping within the valid range [0, lco)
    ip=$(( (ip + 1) % lco ))
    #### track progress through list_input
    n=$(( n + 1 ))
  done
  return 0
} # consume and output line in list_input array starting at index ip
decide_read() {
  # Desc: Decides whether to read another line in list_input array
  #       by comparing $RANDOM to p_seq_int
  # Usage: if decide_read; then something; fi;
  # Input: var: p_seq_int  probability (int [0 32767])
  # Output: return code 0 if $RANDOM is less than p_seq_int;
  #         return code 1 otherwise
  (( RANDOM < p_seq_int ))
} # returns 0 with probability p_seq; 1 otherwise

main() {
  # Desc: Reads lines from stdin and prints them in a mixed order built from
  #       random jumps, each followed by a run of sequential lines.
  # Usage: main [arg1]
  # Input: arg1: int   (optional) maximum number of lines to output;
  #                    defaults to the number of input lines
  #        stdin:      newline-delimited input lines; blank lines are dropped
  #                    and leading/trailing whitespace is trimmed by `read`
  #        var: BKSHUF_PARAM_LINEC, BKSHUF_PARAM_GSIZE
  # Output: stdout: up to arg1 input lines, each printed at most once
  # Depends: bc, shuf (GNU Coreutils);
  #          checkInt(), die(), read_stdin(), consume_line(), decide_read()
  # Note: The working variables are local here; consume_line() and
  #       decide_read() still see them through bash's dynamic scoping.
  local par_l0 par_s0 s_exp s p_jump p_seq p_seq_int
  local lc lco lcr lc_out lc_out_max ip n_loop1 line
  local -a list_input=()

  # Check positional arguments
  lc_out_max=""
  if [[ $# -gt 0 ]]; then
    if ! checkInt "$1"; then
      die "FATAL:Not an integer:$1"
    fi
    # Force base 10 so zero-padded values such as 09 are not read as octal.
    lc_out_max=$(( 10#$1 )) # output line count
  fi

  # store input lines from stdin (like `shuf`)
  while read -r line; do
    if [[ -z $line ]]; then continue; fi # skip blank lines
    #yell "DEBUG:INPUT:$line";
    list_input+=("$line")
  done < <(read_stdin)

  # calc line count (lc)
  lc="${#list_input[@]}"
  #yell "DEBUG:lc:$lc";
  #yell "DEBUG:list_input:$(declare -p list_input)";

  # Nothing to output: stop before l(0) is evaluated by bc and before shuf
  # is handed the empty index range 0..-1.
  if (( lc == 0 )); then
    return 0
  fi

  # calculate group size s
  # s = (s0 - 1) / ln(l0)^2 * ln(lc)^2 + 1, so s = 1 for a single input line
  # and s = s0 when the input has l0 lines.
  par_l0="$BKSHUF_PARAM_LINEC"
  par_s0="$BKSHUF_PARAM_GSIZE"
  s_exp="(( $par_s0 - 1 )/( ( l( $par_l0 ) )^2 ))*(l( $lc ))^2+1"
  s="$(bc -l <<< "scale=12; $s_exp")" || die "FATAL:bc failed:s"

  # calculate probabilities p_jump, p_seq
  p_jump="$(bc -l <<< "scale=12; 1 / ( $s )")" || die "FATAL:bc failed:p_jump"
  p_seq="$(bc -l <<< "scale=12; 1 - $p_jump")" || die "FATAL:bc failed:p_seq"
  # p_seq as int [0 32767] for $RANDOM range; dividing by 1 at scale=0 truncates
  p_seq_int="$(bc -l <<< "scale=0; ($p_seq * 32767)/1")" \
    || die "FATAL:bc failed:p_seq_int"
  if ! checkInt "$p_seq_int"; then
    die "FATAL:Not an integer:p_seq_int:$p_seq_int"
  fi

  # generate output
  lco="$lc"  # save original input line count
  lcr="$lco" # line count remaining
  lc_out=0   # init output line counter
  if [[ -z $lc_out_max ]]; then lc_out_max="$lco"; fi
  n_loop1=0  # jump counter (debug only)
  while (( lcr >= 1 && lc_out < lc_out_max )); do
    ## Select random input index; consume_line outputs the first unconsumed
    ## line found at or after it.
    ip="$(shuf -i "0-$(( lco - 1 ))" -n1)" || die "FATAL:shuf failed"
    consume_line

    ## Consume the next sequential line with probability p_seq.
    while decide_read && (( lcr >= 1 && lc_out < lc_out_max )); do
      consume_line
    done

    n_loop1=$(( n_loop1 + 1 )) # increment jump counter
  done
  #yell "DEBUG:n_loop1:$n_loop1"; # count jumps
  return 0
} # main program

# Run the program, forwarding all command-line arguments unchanged.
main "$@"

# Author: Steven Baltakatei Sandoval
# License: GPLv3+

# Dependency information

# bc 1.07.1
# Copyright 1991-1994, 1997, 1998, 2000, 2004, 2006, 2008, 2012-2017 Free Software Foundation, Inc.


# shuf (GNU coreutils) 8.32
# Copyright (C) 2020 Free Software Foundation, Inc.
# License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>.
# This is free software: you are free to change and redistribute it.
# There is NO WARRANTY, to the extent permitted by law.

# Written by Paul Eggert.

193