Alte Revision

Spielplatz place02

#! /usr/bin/env ruby
# encoding: UTF-8

# iolate.rb:
#
#   Version 0.01
#
# Extract the translatable text from the dokuwiki file cpage.dkw
# into the text file transfertext.utf8 and reinsert the translation
#  
#  Copyright  18.11.2021
#  
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#  
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#  
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#  
 
# == syntax 
#
# iolate.rb [-i inputfilename] 
#           [-o outputfilename] 
#           [-k] 
#           [-v] 
#           [-h] 
#
# -i 
#  Name of the input file in dokuwiki syntax. If none is given, 
#  cpage.dkw is assumed.
#
# -o 
#  Name of the text file to which the translatable text is written.
#  If none is given, transfertext.utf8 is assumed.
#
# -k
#  Keep tags in the outputfile transfertext.utf8
#
# -v
#  verbose
#
# -h
#  show help
#

# == file formats
#
# cpage.dkw
# has to be formatted according to https://comicslate.org/en/wiki/12balloons
# Only the text in text boxes between the first tag pair
# {{cotan>...}} ... {{<cotan}} is extracted into transfertext.utf8;
# text boxes before or after this are left unchanged.
# The first ... here represents a file name which must not contain }}.
# The second ... represents the string which is searched for translatable text
# and which must not itself contain {{<cotan}}.
# Only text in text boxes is extracted, i.e. the line before the text must
# start with an @ which is followed by at least four 
# comma separated integers, e.g. @37,25,419,112 
# After the text there has to be a line containing exactly one ~ character
# at the start of the line, marking the end of the text box.
# The following holds true for the translated file cpage.dkw:
# All <...> tags are removed from the result of the second run.
# All [...] tags which are not at the start of the first line
#           are removed from the result of the second run.
# All -. are removed.
# All linefeeds and carriage returns are removed.
# The removing of <...> and [...] tags can be suppressed with -k.
# The name of this file, with its extension replaced by yaml, is also used
# for the file which stores the data between passes in YAML format.
#
# cpages.yaml
# stores all the data of the file cpages.dkw except for the text which 
# has to be translated
#
# transfertext.utf8
# In the extracted text all text from one text box is concatenated into 
# one line, separated by a space. The text of all text boxes 
# is concatenated, separated by one LF character per box.
# This output file is used as input in the second pass.
# The translated text has to conform to the same specification
# in order to be used for the construction of cpages1.dkw
#
# cpages1.dkw
# The recreated file containing the markup from  cpage.dkw and the text
# from transfertext.utf8

# == usage in context
#
# 0. It is assumed that you have the current version of 
#    the ruby interpreter installed as well as the gems 
#    listed under "dependencies". Copy this script into 
#    a directory on your computer. All further file handling
#    is assumed to take place there
# 
# 1. copy the content of the strip you want to translate 
#    or at least {{cotan>...}}, {{<cotan}} and what is between them 
#    into the file called cpage.dkw in the directory where
#    the executable file iolate.rb is located
#
# 2. execute, e.g call ruby iolate.rb
# 
# 3. open the file transfertext.utf8 and copy its contents 
#    into the input field of a translation service or program
#    like Deepl.com or lingvanex.com
# 
# 4. translate
#
# 5. overwrite the text in transfertext.utf8 with 
#    the translation and save the file
# 
# 6. execute iolate.rb again
#
# 7. cpages1.dkw should now contain the translation
#

# == todo
#
# 1. Add the capability to supply various translations
#    in transfertext.utf8 separated by ~ and produce files
#     cpage1.dkw, cpage2.dkw etc. from them
#
# 2. optionally add a loop and REST-api compatibility 
#    to this script, to translate whole comics at once
#

# == constants
#
# NOTE(review): neither constant is referenced by the code visible in this
# file; the script uses literal "\n" strings instead of NEWLINECHAR.
NEWLINECHAR = "\n" # line feed character used to mark the end of a line
SYSLIMITFILENAME = 255 # maximum length of a file name on ext4

# == dependencies

# OptionParser: class for parsing command line options.
# Only LoadError is rescued; rescuing Exception would also swallow
# interrupts and exit requests.
begin
  require 'optparse' 
rescue LoadError
  STDERR.puts "warnung: require of optparse failed." 
  # No exit here, the original best-effort behaviour is kept.
  # NOTE(review): the main section calls OptionParser.new unconditionally,
  # so the script will still fail there if this require did not succeed.
end

# YAML: class for serialization of the data kept between the two passes.
# Without it the script cannot work, so a failing require is fatal.
begin
  require 'yaml' 
rescue LoadError
  STDERR.puts "warnung: require of yaml failed." 
  exit 1
end

# == methods
#
# to keep in line with a functional approach which might be easier 
# to port to javascript with opal, we make no use of oo,
# but use the methods of this object as functions
 
# Find all complete tags in sStr, i.e. every occurrence of sBeginTag
# that is followed by an occurrence of sEndTag.
#
# sStr      - the string to search
# sBeginTag - the opening delimiter (default "[")
# sEndTag   - the closing delimiter (default "]")
#
# Returns [iaStarts, iaEnds], two arrays of equal length holding the
# position of each opening delimiter and of the matching closing
# delimiter, or nil if no complete tag is found. An opening delimiter
# without a closing one is ignored.
def tagspos(sStr, sBeginTag="[", sEndTag="]")
  # an empty delimiter would match everywhere and never advance the search
  return nil if sBeginTag.empty? || sEndTag.empty?
  iaStarts = []
  iaEnds = []
  iBegin = sStr.index(sBeginTag)
  until iBegin.nil?
    iEnd = sStr.index(sEndTag, iBegin + sBeginTag.length)
    break if iEnd.nil? # unterminated tag: there can be no further complete tag
    iaStarts << iBegin
    iaEnds << iEnd
    # continue the search behind the closing delimiter just found
    iBegin = sStr.index(sBeginTag, iEnd + sEndTag.length)
  end
  iaStarts.empty? ? nil : [iaStarts, iaEnds]
end

# Delete every span of sStr that starts with sBeginChar and runs up to
# and including the next sEndChar. The string is modified in place.
#
# sStr       - the string to clean (mutated)
# sBeginChar - the character opening a span (default "<")
# sEndChar   - the character closing a span (default ">")
#
# Returns sStr itself. An opening character without a closing one
# ends the processing and is left in the string.
def removebetween(sStr, sBeginChar="<", sEndChar=">")
  loop do
    iOpen = sStr.index(sBeginChar)
    break if iOpen.nil?
    iClose = sStr.index(sEndChar, iOpen + 1)
    break if iClose.nil?
    sStr[iOpen..iClose] = ""
  end
  sStr
end

# Abbreviate a long string for display by keeping only its start and end.
#
# slongString - the value to abbreviate; anything that is not a String
#               is converted with inspect first
# icutoff     - number of characters kept at each end (default 70)
#
# Returns a new string. If the text is not longer than 2 * icutoff it is
# returned complete, otherwise its first and last icutoff characters
# are returned, joined by " ... ".
def startandendofdstring(slongString, icutoff = 70)
  sretString = slongString.is_a?(String) ? String.new(slongString) : String.new(slongString.inspect)
  return sretString if sretString.length <= icutoff * 2
  # take exactly icutoff characters from each end, including the last character
  sretString[0, icutoff] + " ... " + sretString[-icutoff, icutoff]
end

# Tell whether sMayBeInteger is the decimal notation of an integer:
# an optional sign followed by one or more digits and nothing else.
# Returns true or false; values which do not match the pattern
# (e.g. nil) yield false.
def is_i?(sMayBeInteger)
  case sMayBeInteger
  when /\A[-+]?\d+\z/ then true
  else false
  end
end

# Tell whether sLine is the head line of a textbox, i.e. an @ followed
# by at least four comma separated fields, e.g. @37,25,419,112
#
# The first three fields have to be integers. The fourth field only has
# to start like an integer (a digit, or a sign followed by a digit).
# Trailing whitespace, e.g. the CR of a CRLF line end, is ignored.
#
# sLine - one line of the dokuwiki file
#
# Returns true or false.
def isBoxStart(sLine)
  return false unless sLine.start_with? "@"
  # everything behind the @, up to and including the last character
  asLine = sLine[1..-1].rstrip.split ","
  return false unless asLine.length > 3
  return false unless asLine[0..2].all? { |sField| is_i?(sField) }
  is_i?(asLine[3][0]) || is_i?(asLine[3][0..1])
end

# == main
#
# Parse the command line and derive the settings used by the rest of
# the script:
#   sDokuwikifile - name of the dokuwiki input file (-i)
#   sTransferfile - name of the text file exchanged with the translator (-o)
#   fkeep         - true if tags inside the text are to be kept (-k)
#   $ivc          - verbosity, 1 if -v was given, otherwise 0
#
# for the use of option parser see http://www.dreamsyssoft.com/ruby-scripting-tutorial/optionparser-tutorial.php
# https://stelfox.net/blog/2012/12/rubys-option-parser-a-more-complete-example/
# http://ruby-doc.org/stdlib-1.9.3/libdoc/optparse/rdoc/OptionParser.html#method-i-make_switch
options = {:verbose => nil, :dokuinfilename => "cpage.dkw", :textoutputfilename => "transfertext.utf8", :fkeep => false} # default values go here
opt_parser = OptionParser.new do |opts|
  opts.banner = "Usage: iolate.rb [-i dokuinfilename] [-o textoutputfilename] [-k] [-v] [-h]" 
  opts.on("-i", "--inputfile FILENAME", "name of the dokuwiki file from which to extract translateable text. The default is cpage.dkw.") do |anopt|
    options[:dokuinfilename] = anopt
  end
  opts.on("-o", "--outputtext FILENAME", "name of the text file to which translateable text is written. The default is transfertext.utf8.") do |anopt|
    options[:textoutputfilename] = anopt
  end
  # -k is a plain switch, it takes no argument
  opts.on("-k", "--keeptags", "do not attempt to delete tags inside the text. By default they are deleted.") do
    options[:fkeep] = true
  end
  opts.on("-v", "--[no-]verbose", "show comments") do |anopt| 
    options[:verbose] = anopt
  end  
  opts.on("-h", "--help", "show this help.") do
    puts opts
    exit
  end
end
begin
  opt_parser.parse!
rescue OptionParser::ParseError => e
  # covers unknown options as well as options lacking their argument
  STDERR.puts e.message
  STDERR.puts opt_parser
  exit 1
end
# an option which was not given leaves its default value (nil for :verbose),
# -v yields true and --no-verbose yields false
# set verbosity flag
$ivc = options[:verbose] ? 1 : 0
sDokuwikifile = options[:dokuinfilename]
if sDokuwikifile.nil? # should be impossible, but well ...
  STDERR.puts "after the -i option there needs to be a filename"
  exit 1
end
sTransferfile = options[:textoutputfilename]
if sTransferfile.nil? # also impossible
  STDERR.puts "after the -o option there needs to be a filename"
  exit 1
end
fkeep = options[:fkeep]
# == processing
#
# The script works in two passes, selected by the existence of the
# transfer file:
#   pass 1 (transfer file missing): read the dokuwiki file, write the text
#     of its textboxes to the transfer file (one line per textbox) and
#     everything else to a yaml file
#   pass 2 (transfer file present): read the yaml file and the (translated)
#     transfer file and merge both into the output file
bInCotan = false # flag: we are between {{cotan>...}} and {{<cotan}}
bInTextbox = false # flag: we are between the head of a textbox and its closing ~
iNrTextboxes = 0 # the total number of textboxes which were read
asPageAsLines = [] # the markup pieces of the file are collected here 
aiTextLineNumbersInPage = [] # for each textbox the index of its head line in asPageAsLines
asTextLines = [] # the text from the text boxes, one element per textbox contains all the lines from this textbox
asTextboxTags = [] # for each textbox the tags at the start of its first text line
# derive the names of the companion files, e.g. for cpage.dkw
# the yaml file cpage.yaml and the output file cpage1.dkw
sExtension = File.extname(sDokuwikifile)
sBasename = sDokuwikifile[0, sDokuwikifile.length - sExtension.length]
sYamlFile = sBasename + ".yaml"
sOutputfile = sBasename + "1" + sExtension
unless File.exist? sTransferfile # if true, we are pre translation, otherwise post translation
  aText = File.read(sDokuwikifile).split "\n"
  asBoxLines = [] # the text lines of the textbox which is currently read
  bFirstBoxLine = false # flag: the next text line is the first one of its textbox
  # store the text collected for the current textbox as one line and
  # reserve the line directly behind the textbox head for the translation
  closeTextbox = lambda {
    asTextLines << asBoxLines.join(" ")
    asBoxLines.clear
    asPageAsLines << String.new # a fresh object per placeholder keeps the yaml dump free of aliases
    bInTextbox = false
  }
  aText.each_index do |iline|
    sline = aText[iline]
    if !bInCotan
      asPageAsLines << sline # lines before and after the cotan block, including its head, are just handed through 
      bInCotan = sline.start_with? "{{cotan>"
    elsif sline.include? "{{<cotan}}" # stop searching for textboxes, if you're out of the cotan block
      closeTextbox.call if bInTextbox # textbox without closing ~: do not lose its text
      bInCotan = false
      asPageAsLines << sline # hand cotan block's tail through 
    elsif bInTextbox
      if sline.strip == "~" # this is the way a textbox ends
        closeTextbox.call
        asPageAsLines << sline # the textbox's tail is handed through
      else
        sline = sline.strip
        if bFirstBoxLine
          # tags at the start of the first line are saved and put back in pass 2
          while sline.start_with? "["
            iClose = sline.index "]"
            break if iClose.nil? # no closing bracket: the rest of the line is text
            asTextboxTags[iNrTextboxes - 1] += sline[0..iClose]
            sline = sline[iClose + 1 .. -1]
          end
          bFirstBoxLine = false
        end
        unless fkeep # remove other tags unless blocked
          removebetween sline
          removebetween sline, "[", "]"
        end
        # the lines of one textbox are joined by a blank, so that
        # the transfer file holds exactly one line per textbox
        asBoxLines << sline unless sline.empty?
      end
    else # inside the cotan block, but not in a textbox
      asPageAsLines << sline # handed through, including the head of a textbox
      if isBoxStart(sline) # a textbox starts with this line, but also a mask
        sNextLine = aText[iline + 1]
        unless sNextLine.nil? || sNextLine.chr == "#" # a line starting with # does not open a textbox
          aiTextLineNumbersInPage << asPageAsLines.length - 1
          asTextboxTags << String.new # initialize tag memory for this textbox
          iNrTextboxes += 1
          bInTextbox = true
          bFirstBoxLine = true
        end
      end
    end
  end # next line of the dokuwiki file
  closeTextbox.call if bInTextbox # the file ended inside a textbox
  # collect all information for a second pass in an array 
  oSerializedData = [fkeep, iNrTextboxes, asPageAsLines, aiTextLineNumbersInPage, asTextboxTags]
  # write intermediate file for second pass, using the original dokuwiki file name with a yaml extension
  File.open(sYamlFile, "w") do |file|
    file.puts YAML::dump(oSerializedData)
  end  
  # write textfile to be translated, one line per textbox
  File.open(sTransferfile, "w") do |file|
    asTextLines.each { |s| file.puts s }
  end
else # post translation
  oSerializedData = YAML.load_file(sYamlFile)
  fkeep, iNrTextboxes, asPageAsLines, aiTextLineNumbersInPage, asTextboxTags = oSerializedData
  # load translated text, one line per textbox
  asTranslatedText = File.read(sTransferfile).lines.map(&:chomp)
  # surplus blank lines at the end of the file are not translations
  while asTranslatedText.length > iNrTextboxes && asTranslatedText.last.strip.empty?
    asTranslatedText.pop
  end
  if asTranslatedText.length != iNrTextboxes 
    STDERR.puts "warning: the number of textboxes #{iNrTextboxes} does not equal the number of translated strings #{asTranslatedText.length}."
  end
  # the translation of a textbox belongs into the line directly behind its head line
  hBoxAtLine = {}
  aiTextLineNumbersInPage.each_with_index { |iHeadLine, iBox| hBoxAtLine[iHeadLine + 1] = iBox }
  # make translation
  asOutputLines = asPageAsLines.each_with_index.map do |sPageLine, iline|
    iTextBoxCur = hBoxAtLine[iline]
    if iTextBoxCur.nil?
      sPageLine # this line gets handed through unchanged
    else
      # to_s guards against a missing translation line
      asTextboxTags[iTextBoxCur].to_s + asTranslatedText[iTextBoxCur].to_s
    end
  end
  # remove trailing empty lines
  sOutputtext = asOutputLines.join("\n").sub(/\n+\z/, "")
  # write final output
  File.open(sOutputfile, "w") do |file|
    file.puts sOutputtext
  end
  # File.delete sTransferfile # cleaning up
  # File.delete sYamlFile # cleaning up
end


__END__
The line above ends the text interpreted as a ruby program.
You can access this and all following text using
the global IO object DATA, which contains all lines 
after the above marker