| 1 | #!/bin/bash |
| 2 | # Desc: Convert wikicode to subpages |
| 3 | # Usage: mw_wc2sp.sh [path file] |
| 4 | # Input: arg1 path input wikicode file |
| 5 | # Output: files wikicode file tree |
| 6 | # Depends: Bash 5.1.16, GNU Coreutils 8.32 |
| 7 | # Version: 0.0.1 |
| 8 | |
# Pattern of a subpage marker line; capture group 2 holds the subpage path.
re_sp='^(<!-- @subpage:)(.*)([ ]*-->)$'

# Default output directory and the file names of the two subpage lists.
d_out='./wikicode/'
f_spl='subpage_list.txt'
f_splv='subpage_list_validated.txt'

# Paths of the raw and the validated subpage list inside the output directory.
p_spl="${d_out}/${f_spl}"
p_splv="${d_out}/${f_splv}"
| 15 | |
yell() {
    # Desc: Write the script path, a colon and all arguments (joined
    #   into one string) as a single line to stderr.
    printf '%s\n' "$0: $*" >&2
}
die() {
    # Desc: Report all arguments through yell(), then terminate the
    #   shell with exit status 111.
    yell "$*"
    exit 111
}
must() {
    # Desc: Run the arguments as a command. If the command fails, hand
    #   "cannot <args>" to die().
    if ! "$@"; then
        die "cannot $*"
    fi
}
get_path_fork_level() {
    # Desc: Get fork level from two paths, i.e. how many leading path
    #   elements (counted from the root side) both paths share.
    # Example: get_path_fork_level "a/b/c" "a/b/d"; # prints 2
    # Input: arg1  str  path
    #        arg2  str  path
    # Output: stdout  int  fork level
    #         stderr  str  diagnostics if the paths cannot be compared
    # Return: 0 on success
    #         1 if one path is absolute and the other is relative
    # Depends: GNU Coreutils 8.32 (tr), GNU sed 4.8
    # Version: 0.0.2
    local path1="$1";
    local path2="$2";
    local abs1=false abs2=false;
    local fork_level=0;
    local i;
    local -a parts1 parts2;

    # Check for mixed absolute/relative paths. This is done before any
    # trimming so that a bare `/` still counts as an absolute path.
    if [[ "$path1" == /* ]]; then abs1=true; fi;
    if [[ "$path2" == /* ]]; then abs2=true; fi;
    if [[ "$abs1" != "$abs2" ]]; then
        # Diagnostics go to stderr only; stdout is reserved for the result.
        declare -p path1 path2 1>&2;
        echo "FATAL:Mixed relative and absolute paths not supported." 1>&2;
        return 1;
    fi;

    # Squeeze multiple slashes, remove the initial slash and trailing slashes
    path1="$(printf '%s\n' "$path1" | tr -s '/' | sed -e 's:^/::' -e 's:/*$::' )";
    path2="$(printf '%s\n' "$path2" | tr -s '/' | sed -e 's:^/::' -e 's:/*$::' )";

    # Save path as arrays with `/` as element delimiter
    local IFS='/';
    read -ra parts1 <<< "$path1";
    read -ra parts2 <<< "$path2";

    # Get fork level by counting identical path elements from rootside
    for (( i=0; i<${#parts1[@]} && i<${#parts2[@]}; i++ )); do
        if [[ "${parts1[i]}" != "${parts2[i]}" ]]; then break; fi;
        fork_level=$((fork_level + 1));
    done;

    printf '%s\n' "$fork_level";
    return 0;
}; # Get fork level int from two paths
prune_path_rootside() {
    # Desc: Prunes a path from the root-side to a specified prune level,
    #   i.e. drops the first <prune level> path elements.
    # Example: prune_path_rootside "a/b/c" 1; # prints b/c
    # Input: arg1  str  path
    #        arg2  int  prune level (0-indexed; empty counts as 0)
    # Output: stdout  str  pruned path without trailing slashes. The
    #           initial `/` of an absolute path is kept only if the
    #           prune level is 0.
    #         stderr  str  diagnostics on invalid prune level
    # Return: 0 on success
    #         1 if the prune level is not a non-negative integer
    # Version: 0.0.2
    local path="$1";
    local prune_level="${2:-0}";
    local flag_root=false;
    local pruned_path="";
    local i;
    local -a parts;

    # Reject anything that is not a plain non-negative integer. A
    # negative level would index the array from its end.
    if [[ ! "$prune_level" =~ ^[0-9]+$ ]]; then
        echo "FATAL:Prune level is not a non-negative integer:$prune_level" 1>&2;
        return 1;
    fi;
    prune_level="$((10#$prune_level))"; # force base 10 (e.g. `08`)

    # Check for absolute or relative path
    if [[ "$path" == /* ]]; then
        flag_root=true;
        # Remove initial /
        path="${path#/}";
    fi;

    # Save path as array with `/` as element delimiter
    local IFS='/';
    read -ra parts <<< "$path";

    # Assemble pruned path from prune_level
    for (( i=prune_level; i<${#parts[@]}; i++ )); do
        pruned_path+="${parts[i]}/";
    done;

    # Trim trailing `/` delimiters
    while [[ "$pruned_path" == */ ]]; do
        pruned_path="${pruned_path%/}";
    done;

    # Restore initial / if appropriate
    if [[ "$flag_root" == "true" ]] && [[ "$prune_level" -eq 0 ]]; then
        pruned_path="/${pruned_path}";
    fi;

    # Output pruned path
    printf '%s\n' "$pruned_path";
    return 0;
}; # prune path rootside to int specified level
get_path_hierarchy_level() {
    # Desc: Outputs hierarchy level of input paths. A path with a single
    #   element has level 0; every further `/`-delimited element adds 1.
    #   The initial `/` of absolute paths is not counted.
    # Example: $ cat lines.txt | get_path_hierarchy_level
    # Input: stdin   str  lines with /-delimited paths
    # Output: stdout  int  hierarchy level of each path, one per line
    #         stderr  str  diagnostics on mixed input
    # Return: 0 on success
    #         1 if absolute and relative paths are mixed (nothing is
    #           written to stdout in that case)
    # Depends: GNU Coreutils 8.32 (tr), GNU sed 4.8, awk
    # Version: 0.0.2

    local line level;
    local flag_root=false;
    local n=0;
    local -a output=();

    # `|| [[ -n "$line" ]]` also processes a last line lacking a newline.
    while read -r line || [[ -n "$line" ]]; do
        # The first line alone decides whether the input is absolute or
        # relative; later lines are only compared against that decision.
        if [[ "$n" -eq 0 ]]; then
            if [[ "$line" == /* ]]; then
                flag_root=true;
            else
                flag_root=false;
            fi;
        fi;

        # Check for mixed absolute/relative paths.
        if { [[ "$flag_root" == "true" ]] && [[ "$line" != /* ]]; } || \
               { [[ "$flag_root" == "false" ]] && [[ "$line" == /* ]]; }; then
            echo "FATAL:Mixed relative and absolute paths not supported." 1>&2; return 1;
        fi;

        # Squeeze multiple slashes and remove trailing slashes
        line="$(printf '%s\n' "$line" | tr -s '/' | sed 's:/*$::' )";

        # Count the number of slashes to determine hierarchy level
        level="$(printf '%s\n' "$line" | awk -F'/' '{print NF-1}' )";
        if [[ "$flag_root" == "true" ]]; then level=$((level - 1)); fi;

        # Append to output
        output+=("$level");
        n=$((n + 1));
    done;

    # Print output (nothing at all for empty input)
    if [[ "${#output[@]}" -gt 0 ]]; then
        printf "%s\n" "${output[@]}";
    fi;
    return 0;
}; # return hierarchy level of lines as integers
validate_subpage_list() {
    # Desc: Check for illegal characters in subpage titles and replace
    #   `&`, `"` and `'` with HTML-style codes.
    # Input: stdin   unvalidated subpage list, one title per line
    # Output: stdout  validated subpage list, one title per line
    #         stderr  diagnostics
    # Return: 1 if sed or the output loop fails. Illegal titles
    #   terminate the shell through die().
    # Depends: die()
    #          GNU sed v4.8
    local line;
    local re_illegal='[][><|}{#_]'; # match illegal page names chars #, <, >, [, ], _, {, |, }
    local re_ts='[[:blank:]]$';     # match trailing space or tab

    # `IFS=` keeps trailing blanks so that they can actually be detected
    # below; `|| [[ -n "$line" ]]` handles a last line lacking a newline.
    while IFS= read -r line || [[ -n "$line" ]]; do

        # Drop leading blanks (as the former IFS-splitting read did).
        line="${line#"${line%%[![:blank:]]*}"}";

        # Reject chars illegal in Mediawiki page titles.
        if [[ "$line" =~ $re_illegal ]]; then
            die "FATAL:Illegal char. Not allowed: #, <, >, [, ], _, {, |, }:$line";
        fi;

        # Reject trailing spaces.
        if [[ "$line" =~ $re_ts ]]; then
            die "FATAL:Trailing spaces not allowed:$line";
        fi;

        # Replace some chars with HTML-style codes
        ## replace ampersand & with &amp;   # must be first
        ## replace double quote " with &quot;
        ## replace single quote ' with &#39;
        ## (`\&` is a literal ampersand in a sed replacement)
        line="$(sed \
                    -e 's/&/\&amp;/g' \
                    -e 's/"/\&quot;/g' \
                    -e "s/'/\&#39;/g" \
                    <<< "$line" )" || { echo "FATAL:Error running sed." 1>&2; return 1; };
        printf "%s\n" "$line";
    done || {
        echo "FATAL:Error reading stdin." 1>&2; return 1; };
};
check_input() {
    # Desc: Make sure the first argument names an existing regular
    #   file; otherwise abort through die().
    # Input: arg1  str  path of the input file
    local path_in="$1"
    [[ -f "$path_in" ]] || die "FATAL:Not a file path:$1"
}; # check input
assemble_subpage_ftree() {
    # Desc: Identify subpage markers in input wikicode file to create
    #   subpage list and subpage content files. Every input line is
    #   appended to the content file of the most recent marker (the
    #   marker line itself included); lines before the first marker go
    #   to `presubpage.content`.
    # Input: var  fp_in  path input file
    #        var  re_sp  regex for identifying subpage markers
    #        var  d_out  path directory for output
    #        var  p_spl  path subpage list file
    # Output: files  ${d_out}/<subpage path>.content
    #         file   ${p_spl} (one subpage path appended per marker)
    # Depends: yell(), die(), must()
    #          GNU Coreutils 8.32 (dirname, mkdir, touch)
    local line marker sp_path spc_path spc_dir;

    yell "STATUS:Running assemble_subpage_ftree()."; # debug

    # Make sure the output dir exists before anything is written to it
    # (content may precede the first subpage marker).
    if [[ ! -d "$d_out" ]]; then
        must mkdir -p "$d_out" && \
            yell "STATUS:Created dir:${d_out}";
    fi;

    spc_path="${d_out}/presubpage.content"; # default destination for content before subpage detected

    ## Process input line-by-line
    ## `IFS=` keeps content lines byte-for-byte (a plain `read` drops
    ## leading and trailing blanks); `|| [[ -n "$line" ]]` keeps a last
    ## line that lacks a newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
        ### Check for subpage marker (surrounding blanks are ignored)
        marker="${line#"${line%%[![:blank:]]*}"}";
        marker="${marker%"${marker##*[![:blank:]]}"}";
        if [[ "$marker" =~ $re_sp ]]; then
            #### Identify new subpage path (group 2, trailing spaces removed)
            sp_path="${BASH_REMATCH[2]}";
            sp_path="${sp_path%"${sp_path##*[! ]}"}";
            if [[ -z "$sp_path" ]]; then
                die "FATAL:Empty subpage name:${line}";
            fi;
            #### Update subpage content file path
            spc_path="${d_out}/${sp_path}.content";
            spc_dir="$(dirname "$spc_path"; )";
            #### Prepare file destination
            if [[ ! -d "$spc_dir" ]]; then
                must mkdir -p "$spc_dir" && \
                    yell "STATUS:Created dir:${spc_dir}";
            fi;
            if [[ -f "$spc_path" ]]; then
                die "FATAL:File already exists:${spc_path}";
            else
                must touch "$spc_path";
            fi;
            #### Append subpage path to subpage list
            printf "%s\n" "$sp_path" >> "$p_spl" || \
                die "FATAL:Cannot append to subpage list:${p_spl}";
        fi;
        ### Write subpage content
        ### (checked here because a failed redirection never reaches must())
        printf "%s\n" "$line" >> "$spc_path" || \
            die "FATAL:Cannot write subpage content:${spc_path}";
    done < "${fp_in}";

    yell "STATUS:Finished assemble_subpage_ftree()."; # debug
}; # process input wikicode into subpage content files and subpage list
create_output_wikicode() {
    # Desc: Use subpage list and subpage content files to create
    #   output subpage wikicode. Each output file starts with a line of
    #   navigation links (Next / Previous / Up, as applicable), followed
    #   by the subpage content wrapped in <onlyinclude> and by the
    #   References, Footnotes and Comments sections.
    # Input: var   p_spl      path subpage list file
    #        var   p_splv     path subpage list file (validated)
    #        file  ${p_spl}   subpage list file
    #        file  ${p_splv}  subpage list file (validated)
    #        var   d_out      path directory for output
    # Depends: get_path_fork_level()
    #          prune_path_rootside()
    #          get_path_hierarchy_level()
    #          yell(), die(), must()
    # Output: files   subpages in $d_out (`<validated name>.wc`)
    #         stderr  status and debug messages
    local -a lines_spl lines_splv;
    local lc_spl lc_splv i j;
    local f_spc p_spc f_spwc p_spwc;
    local lprev="" lcurr="" lnext="";
    local lprev_hier="" lcurr_hier="" lnext_hier="";
    local fork_level_next fork_level_prev relups_next relups_prev;
    local link_next link_prev;

    yell "Running create_output_wikicode()."; # debug

    # Read subpage list files into arrays.
    mapfile -t lines_spl < "$p_spl" || die "FATAL:Cannot read subpage list:$p_spl";
    mapfile -t lines_splv < "$p_splv" || die "FATAL:Cannot read subpage list:$p_splv";
    ## Add extra blank lines for couple line comparisons
    lines_spl+=('');
    lines_splv+=('');
    declare -p lines_spl 1>&2; # debug

    # Check that subpage list files have same line counts
    lc_spl="${#lines_spl[@]}";
    lc_splv="${#lines_splv[@]}";
    if [[ "$lc_spl" -ne "$lc_splv" ]]; then
        die "FATAL:Different line counts for subpage lists:$(declare -p lc_spl lc_splv;)";
    fi;
    declare -p lc_spl lc_splv 1>&2; # debug

    # Read content files according to subpage list file
    # Note: $i corresponds to “next” line ($lnext). Therefore, use
    #   $((i-1)) to access the “current” ($lcurr) line. This offset is
    #   because subpage list lines are compared using lagging line
    #   comparison.
    for i in "${!lines_spl[@]}"; do
        declare -p i 1>&2; # debug

        # Check subpage content files
        f_spc="${lines_spl[i-1]}.content";
        p_spc="${d_out}/${f_spc}";
        declare -p f_spc p_spc 1>&2; # debug
        ## Exit if subpage content file missing
        if [[ "$i" -gt 0 ]] && [[ ! -f "$p_spc" ]]; then
            die "FATAL:Subpage content file missing:$p_spc"; fi;

        # Prepare output subpage wikicode files
        f_spwc="${lines_splv[i-1]}.wc";
        p_spwc="${d_out}/${f_spwc}"; # use validated subpage name
        declare -p f_spwc p_spwc 1>&2; # debug
        if [[ "$i" -gt 0 ]]; then
            ## The validated name may differ from the raw name, so its
            ## parent dir is not guaranteed to exist yet.
            must mkdir -p "$(dirname "$p_spwc")";
            must touch "$p_spwc";
        fi;

        # Advance input lines
        lprev="$lcurr";
        lcurr="$lnext";
        lnext="${lines_splv[i]}";
        declare -p lprev lcurr lnext 1>&2; # debug

        # Update hierarchy tracker states
        lprev_hier="$lcurr_hier";
        lcurr_hier="$lnext_hier";
        lnext_hier="$(printf '%s\n' "$lnext" | get_path_hierarchy_level)";

        # Skip first iteration
        if [[ "$i" -eq 0 ]]; then
            yell "$i:DEBUG:Skipping first iteration."; # debug
            printf -- "----\n" 1>&2; # debug
            continue; fi;

        # Build Next and Prev links. A link is only built if the
        # neighbour exists (the blank sentinel needs no link):
        #   - get path fork level
        #   - count relative ups needed (`../`) to reach the fork
        #   - a leading `/` marks a target below the current subpage
        #   - append branch from fork to target
        fork_level_next=""; relups_next=""; link_next="";
        fork_level_prev=""; relups_prev=""; link_prev="";
        if [[ -n "$lnext" ]]; then
            fork_level_next="$(get_path_fork_level "$lcurr" "$lnext")" || \
                die "FATAL:Cannot get fork level:$lcurr:$lnext";
            relups_next="$((lcurr_hier - fork_level_next + 1))";
            for (( j=0; j<relups_next; j++ )); do link_next+="../"; done;
            if [[ "$relups_next" -eq 0 ]]; then link_next+="/"; fi; # handle new subpage path dive
            link_next+="$(prune_path_rootside "$lnext" "$fork_level_next")";
        fi;
        if [[ -n "$lprev" ]]; then
            fork_level_prev="$(get_path_fork_level "$lcurr" "$lprev")" || \
                die "FATAL:Cannot get fork level:$lcurr:$lprev";
            relups_prev="$((lcurr_hier - fork_level_prev + 1))";
            for (( j=0; j<relups_prev; j++ )); do link_prev+="../"; done;
            if [[ "$relups_prev" -eq 0 ]]; then link_prev+="/"; fi; # same dive handling as for Next
            link_prev+="$(prune_path_rootside "$lprev" "$fork_level_prev")";
        fi;

        # Print navigation link wikicode
        if [[ -z "$lprev" ]] && [[ -z "$lnext" ]]; then
            ## only subpage in the list: neither Next nor Previous
            printf "[[../|Up]]\n" >> "$p_spwc";
        elif [[ -z "$lprev" ]]; then
            printf "[[%s|Next]], [[../|Up]]\n" "$link_next" >> "$p_spwc";
        elif [[ -n "$lnext" ]]; then
            printf "[[%s|Next]], [[%s|Previous]], [[../|Up]]\n" "$link_next" "$link_prev" >> "$p_spwc";
        else
            printf "[[%s|Previous]], [[../|Up]]\n" "$link_prev" >> "$p_spwc";
        fi;

        # Print subpage content
        {
            printf -- "\n----<onlyinclude>\n" &&
                cat "$p_spc" &&
                printf -- "\n</onlyinclude>----\n" &&
                printf -- "\n==References==\n<references />\n" &&
                printf -- "\n==Footnotes==\n<references group=fn />\n" &&
                printf -- "\n==Comments==\n<references group=cmt />\n" &&
                printf -- "\n";
        } >> "$p_spwc" || die "FATAL:Cannot write subpage wikicode:$p_spwc";

        declare -p i lprev lcurr lnext lprev_hier lcurr_hier lnext_hier 1>&2; # debug
        declare -p fork_level_next fork_level_prev relups_next relups_prev 1>&2; # debug
        declare -p link_next link_prev 1>&2; # debug
        printf "====================\n" 1>&2; # debug
    done;

    yell "STATUS:Finished create_output_wikicode()."; # debug
}; # generate output subpage wikicode
main() {
    # Desc: Main program. Splits the input wikicode file into subpage
    #   content files, validates the subpage list and generates the
    #   output subpage wikicode.
    # Input: arg1   str  path input wikicode file
    #        var    p_spl   path subpage list file
    #        var    p_splv  path subpage list file (validated)
    # Output: var   fp_in  (global) path input file
    #         file  ${p_splv}
    check_input "$@";
    declare -g fp_in="$1"; # input file path
    assemble_subpage_ftree;
    # Without a subpage list there is nothing to validate or generate.
    if [[ ! -f "$p_spl" ]]; then
        die "FATAL:Subpage list missing (no subpage markers found?):${p_spl}";
    fi;
    validate_subpage_list < "$p_spl" > "$p_splv" || \
        die "FATAL:Cannot validate subpage list:${p_spl}";
    create_output_wikicode;
}; # main program
| 349 | |
# Run the main program with all command-line arguments of the script.
main "$@";
| 351 | |
| 352 | # Author: Steven Baltakatei Sandoval |
| 353 | # License: GPLv3+ |
| 354 | |
| 355 | |
| 356 | # Example input: |
| 357 | # ``` |
| 358 | # <!-- @subpage:Introduction --> |
# This is an introduction.
| 360 | # <!-- @subpage:Foreword --> |
| 361 | # This is a foreword. |
| 362 | |
| 363 | # <!-- @subpage:Part 1/Chapter 1 --> |
| 364 | # Blah. |
| 365 | # <!-- @subpage:Part 1/Chapter 2 --> |
| 366 | # Blah. |
| 367 | # <!-- @subpage:Part 1/Chapter 2/Section A --> |
| 368 | # Blabbity blah. |
| 369 | # <!-- @subpage:Part 2/ --> |
| 370 | # Blah. |
| 371 | # <!-- @subpage:Part 2/Chapter 1 --> |
| 372 | # More blah. |
| 373 | # ``` |