feat(user/transcribe_whisper.sh):Shuffle input files
[BK-2020-03.git] / user / transcribe_whisper.sh
#!/bin/bash
# Desc: Runs OpenAI Whisper on media files found under the input directory
# Usage: ./transcribe_whisper.sh [dir] [cuda_num]   (e.g. ./transcribe_whisper.sh ./media 3)
# Input: arg1 input dir (searched recursively for media files)
#        arg2 CUDA graphics card number (zero-indexed, required)
# Version: 0.4.0
# Depends: whisper ( https://github.com/openai/whisper )
8
# Find settings
# firegex is passed to `find -iregex`, which matches case-insensitively
# against the WHOLE path. The literal `\.` before the group makes sure only
# real file extensions match (e.g. `talk.mp4`), not any path that merely
# ends in those letters (e.g. `x.bmp4` or `fakemp3`).
firegex=".+\.\(aac\|aif\|aiff\|flac\|m4a\|m4b\|mkv\|mp3\|mp4\|ogg\|opus\|wav\)$"; # update according to `find . -type f | grep -Eo "\.([[:alnum:]])+$" | sort -u`
fsize="10k"; # default: minimum "10k" (used as `find -size +$fsize`)
fdepth="10"; # find depth (used as `find -maxdepth $fdepth`)
13
# yell: print "<script path>: <all args joined by spaces>" to stderr.
# printf '%s\n' is used instead of echo so the text is emitted verbatim
# regardless of echo's option/backslash handling.
yell() { printf '%s\n' "$0: $*" >&2; } # print script path and all args to stderr
# die: same output as yell(), then terminate the current shell with status 111.
die() { yell "$*"; exit 111; } # same as yell() but non-zero exit status
# must: run the args as a command; if it fails, die with "cannot <args>".
must() { "$@" || die "cannot $*"; } # runs args as command, reports args if command fails
checkInt() {
  # Desc: Checks if arg is a non-negative integer (one or more digits 0-9)
  # Usage: checkInt arg
  # Input: arg: string to test
  # Output: - return code 0 (if arg consists only of digits)
  #         - return code 1 (if arg is anything else, including empty,
  #           signed or decimal values)
  #         - calls die() if not given exactly one argument
  # Example: if ! checkInt $arg; then echo "not int"; fi;
  # Version: 0.0.3
  local re_digits='^[0-9]+$'; # kept local so no pattern variable leaks to the caller

  #===Process Arg===
  if [[ $# -ne 1 ]]; then
    die "ERROR:Invalid number of arguments:$#";
  fi;

  #===Determine function return code===
  # The status of the regex test is the function's return code.
  [[ "$1" =~ $re_digits ]];
} # Checks if arg is integer
find_flist() {
  # Desc: print file list to stdout via `find` using script parameters
  # Input: arg1: path to dir
  #        var: fdepth   find depth
  #        var: firegex  pattern find iregex
  #        var: fsize    find size
  # Output: stdout: one matching file path per line
  #         return code 1 (with a message on stderr) if arg1 is not a directory
  if [[ ! -d "$1" ]]; then
    # Say why nothing is listed instead of failing silently.
    yell "ERROR:Not a directory:$1";
    return 1;
  fi;
  # Regular files only, at most $fdepth levels deep, whose path matches
  # $firegex (case-insensitive) and whose size exceeds $fsize.
  must find "$1" -maxdepth "$fdepth" -type f -iregex "$firegex" -size +"$fsize";
}; # print file list to stdout from dir with script parameters
main() {
  # Desc: transcribe every media file under the input dir with whisper,
  #       in random order, skipping files that are already transcribed or
  #       currently claimed by another run.
  # Input: arg1: dir_in    input dir
  #        arg2: cuda_num  cuda GPU index
  #        var: fdepth   (find_flist) find depth
  #        var: firegex  (find_flist) pattern find iregex
  #        var: fsize    (find_flist) find size
  # Output: whisper output files written next to each input file;
  #         status messages on stderr;
  #         exits 111 via die() on an invalid GPU index or input dir
  local dir_in="$1";
  local cuda_num="$2";
  local line dir_out ftmp base ext has_output;

  if ! checkInt "$cuda_num"; then die "FATAL:No graphics card selected."; fi;
  if [[ ! -d "$dir_in" ]]; then die "FATAL:Input dir not found:$dir_in"; fi;

  # IFS= keeps leading/trailing whitespace in file names intact.
  while IFS= read -r line; do
    printf '%s\n' "STATUS:Processing:$line" 1>&2;
    SECONDS=0; # bash timer; reports per-file elapsed seconds below
    dir_out="$(dirname -- "$line")";
    ftmp="$line".tmp; # marker file claiming this input for the current run
    base="${line%.*}"; # input path without its extension
    #declare -p line dir_out ftmp; # debug

    # Any existing whisper output for this input counts as a conflict.
    has_output="false";
    for ext in srt vtt txt tsv json; do
      if [[ -f "$base.$ext" ]]; then
        has_output="true";
        break;
      fi;
    done;

    # Claim the file by creating the marker with noclobber: the redirect
    # fails if the marker already exists, so checking and creating happen in
    # one step and two concurrent runs cannot both claim the same file.
    if [[ "$has_output" == "true" ]] \
      || ! ( set -o noclobber; : > "$ftmp" ) 2>/dev/null; then
      yell "STATUS:Skipping:$line";
      continue;
    fi;
    yell "STATUS:No conflicts detected.";

    # stdin comes from /dev/null so whisper (or anything it spawns) cannot
    # consume the file list this loop is reading.
    if whisper "$line" \
      --model large-v3 \
      --output_format all \
      --output_dir "$dir_out" \
      --language en \
      --device cuda:"$cuda_num" < /dev/null; then
      printf '%s\n' "STATUS:$SECONDS:Finished:$line" 1>&2;
    else
      yell "ERROR:$SECONDS:whisper failed:$line";
    fi;
    # Release the marker on success AND failure, so a failed file is not
    # blocked forever by a stale .tmp and can be retried on a later run.
    rm -f -- "$ftmp";
  done < <(find_flist "$dir_in" | shuf);
}; # main program
# Export the helper functions to the environment so that child bash
# processes started by this script can call them as well.
export -f yell die must find_flist;

# Entry point: forward all command-line arguments (dir, cuda_num) to main.
main "$@";
97
98