Alte Revision
Spielplatz place02
#! /usr/bin/env ruby
# encoding: UTF-8
# iolate.rb:
#
# Version 0.01
#
# Extract and reinsert the translatable text from
# the dokuwiki file cpage.dkw to the text file transfertext.utf8
#
# Copyright 18.11.2021
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
# == syntax
#
# iolate.rb [-i inputfilename]
#           [-o throughputfilename]
#           [-k]
#           [-v]
#           [-h]
#
# -i
#   name of the file in dokuwiki syntax. If none is given,
#   cpage.dkw is assumed
#
# -o
#   name of the text file which receives the extracted text.
#   If none is given, transfertext.utf8 is assumed
#
# -k
#   keep tags in the output file transfertext.utf8
#
# -v
#   verbose
#
# -h
#   show help
#
# == file formats
#
# cpage.dkw
#   has to be formatted as described here (the reference link which
#   originally stood at this place is missing from this copy of the file).
#   Only the text in text boxes between a tag pair
#   {{cotan>...}} ... {{<cotan}} is extracted into transfertext.utf8,
#   text boxes before or after this are left unchanged.
#   NOTE(review): this text originally said "the first tag pair", but the
#   code processes every {{cotan>...}} block it meets - confirm the intent.
#   The first ... here represents a file name which must not contain }}.
#   The second ... represents the string which is searched for translatable
#   text and which must not itself contain {{<cotan}}.
#   Only text in text boxes is extracted, i.e. the line before the text must
#   start with an @ which is followed by at least four
#   comma separated integers, e.g.
#     @37,25,419,112
#   After the text there has to be a line containing exactly one ~ character
#   at the start of the line, marking the end of the text box.
#   The following holds true for the translated file cpage1.dkw:
#     All <...> tags are removed
#     All [...] tags which are not at the start of the first line
#     are removed
#     All -. are removed
#       NOTE(review): this is not implemented in the code below
#     All linefeeds and carriage returns inside a text box are removed
#   The removal of <...> and [...] tags can be suppressed with -k.
#   The name of this file is also used to derive the name of the file
#   which stores the data between passes in yaml format.
#
# cpage.yaml
#   stores all the data of the file cpage.dkw except for the text which
#   has to be translated
#
# transfertext.utf8
#   In the extracted text all text from one text box is concatenated into
#   one line, separated by a space. The text of all text boxes
#   is concatenated, separated by one LF character per box.
#   This output file is used as input in the second pass.
#   The translated text has to conform to the same specification
#   in order to be used for the construction of cpage1.dkw
#
# cpage1.dkw
#   The recreated file containing the markup from cpage.dkw and the text
#   from transfertext.utf8
#
# == usage in context
#
# 0. It is assumed that you have the current version of
#    the ruby interpreter installed as well as the libraries
#    listed under "dependencies". Copy this script into
#    a directory on your computer. All further file handling
#    is assumed to take place there
#
# 1. copy the content of the strip you want to translate
#    or at least {{cotan>...}}, {{<cotan}} and what is between them
#    into the file called cpage.dkw in the directory where
#    the executable file iolate.rb is located
#
# 2. execute, e.g. call ruby iolate.rb
#
# 3. open the file transfertext.utf8 and copy its contents
#    into the input field of a translation service or program
#    (the example links are missing from this copy of the file)
#
# 4. translate
#
# 5. overwrite the text in transfertext.utf8 with
#    the translation and save the file
#
# 6. execute iolate.rb again
#
# 7. cpage1.dkw should now contain the translation
#
# == todo
#
# 1. Add the capability to supply various translations
#    in transfertext.utf8 separated by ~ and produce files
#    cpage1.dkw, cpage2.dkw etc. from them
#
# 2. optionally add a loop and REST-api compatibility
#    to this script, to translate whole comics at once
#
# == constants
#
NEWLINECHAR = "\n"     # EOL standard char to indicate end of line
SYSLIMITFILENAME = 255 # maximal length of a file name for ext4

# == dependencies

# class for parsing commandline options
begin
  require 'optparse'
rescue LoadError
  STDERR.puts "warnung: require of optparse failed."
  # exit 1 # nonfatal if no options are given, so cross your fingers
end

# class for serialization
begin
  require 'yaml'
rescue LoadError
  STDERR.puts "warnung: require of yaml failed."
  exit 1
end

# == methods
#
# to keep in line with a functional approach which might be easier
# to port to javascript with opal, we make no use of oo,
# but use the methods of this object as functions

# Find all tags which start with sBeginTag and are closed with sEndTag.
#
# sStr      - the string to search
# sBeginTag - the string which opens a tag
# sEndTag   - the string which closes a tag
#
# Returns [start positions, end positions] of all complete tags, where an
# end position is the index at which the closing sEndTag starts, or nil if
# no complete tag is found. An opening tag without a closing tag ends the
# search and is not reported.
def tagspos(sStr, sBeginTag = "[", sEndTag = "]")
  iaStarts = []
  iaEnds = []
  iBegin = sStr.index(sBeginTag)
  until iBegin.nil?
    iEnd = sStr.index(sEndTag, iBegin + sBeginTag.length)
    break if iEnd.nil? # unclosed tag: nothing more to find

    iaStarts << iBegin
    iaEnds << iEnd
    # continue behind the tag which has just been completed
    iBegin = sStr.index(sBeginTag, iEnd + sEndTag.length)
  end
  return nil if iaStarts.empty?

  [iaStarts, iaEnds]
end

# Remove everything between sBeginChar and sEndChar, including those
# delimiters, all over sStr.
#
# sStr is modified in place (callers rely on this) and is also returned.
# An opening delimiter without a closing one is left untouched.
def removebetween(sStr, sBeginChar = "<", sEndChar = ">")
  iBegin = sStr.index(sBeginChar)
  until iBegin.nil?
    iEnd = sStr.index(sEndChar, iBegin + 1)
    break if iEnd.nil?

    sStr.slice!(iBegin, iEnd - iBegin + sEndChar.length)
    iBegin = sStr.index(sBeginChar)
  end
  sStr
end

# Abbreviate a long value for display.
#
# slongString - a String, or any other object (its inspect output is used)
# icutoff     - the number of characters kept at each end
#
# Returns the text unchanged if it is at most 2 * icutoff characters long,
# otherwise its first and last icutoff characters joined by " ... ".
def startandendofdstring(slongString, icutoff = 70)
  sretString = slongString.is_a?(String) ? slongString.to_s : slongString.inspect.to_s
  l = sretString.length
  return sretString if l <= icutoff * 2

  sretString[0..icutoff - 1] + " ... " + sretString[l - icutoff..-1]
end

# returns true if the string sMayBeInteger represents an integer,
# false for every other string and for every non-string
def is_i?(sMayBeInteger)
  sMayBeInteger.is_a?(String) && sMayBeInteger.match?(/\A[-+]?\d+\z/)
end

# returns true, if the string sLine is the first line of a textbox, i.e.
# an @ followed by at least four comma separated fields, of which the first
# three are integers and the fourth at least starts like an integer
def isBoxStart(sLine)
  return false unless sLine.start_with?("@")

  asLine = sLine[1..-1].rstrip.split(",")
  return false unless asLine.length > 3
  return false unless asLine[0..2].all? { |sField| is_i?(sField) }

  # the fourth field may be followed by other characters, so only its
  # first digit (or sign and first digit) is checked
  is_i?(asLine[3][0]) || is_i?(asLine[3][0..1])
end

# Split a file name into the part before its extension and the extension
# (including the leading dot; empty if the name has no extension).
def splitfilename(sFilename)
  sExtension = File.extname(sFilename)
  [sFilename[0, sFilename.length - sExtension.length], sExtension]
end

# First pass: separate the translatable text from the markup.
#
# aText - the lines of the dokuwiki file, without line terminators
# fkeep - if true, <...> and [...] tags inside the text are kept
#
# Returns an array of
#   iNrTextboxes            - the number of textboxes which were found
#   asPageAsLines           - the markup lines; every closed textbox is
#                             represented by one empty placeholder line
#                             directly behind its @ line
#   aiTextLineNumbersInPage - for every textbox the index of its @ line in
#                             asPageAsLines
#   asTextboxTags           - for every textbox the [...] tags found at the
#                             start of its first text line
#   asTextLines             - for every closed textbox its text, all lines
#                             joined by a space
def extracttext(aText, fkeep = false)
  bInCotan = false   # flag
  bInTextbox = false # flag
  asPageAsLines = []
  aiTextLineNumbersInPage = []
  asTextLines = []
  asTextboxTags = []
  asBoxLines = [] # collects the non empty text lines of the current textbox

  aText.each_with_index do |sline, iline|
    unless bInCotan
      # lines before and after the cotan block, including the cotan
      # block's head, are just handed through
      asPageAsLines << sline
      bInCotan = sline.start_with?("{{cotan>")
      next
    end

    if sline.include?("{{<cotan}}")
      # stop searching for textboxes, hand the cotan block's tail through
      bInCotan = false
      asPageAsLines << sline
      next
    end

    unless bInTextbox
      # lines between textboxes, including a textbox's head, are handed through
      asPageAsLines << sline
      # a textbox starts with an @ line, but so does a mask;
      # a mask is recognized by a following line which starts with #
      sNext = aText[iline + 1]
      if isBoxStart(sline) && !sNext.nil? && !sNext.start_with?("#")
        aiTextLineNumbersInPage << asPageAsLines.length - 1
        asTextboxTags << "" # initialize the tag memory for this textbox
        bInTextbox = true
      end
      next
    end

    if sline.strip == "~"
      # this is the way a textbox ends: store its text as one line
      asTextLines << asBoxLines.join(" ")
      asBoxLines = []
      asPageAsLines << "" # placeholder, the text itself goes to asTextLines
      asPageAsLines << sline
      bInTextbox = false
      next
    end

    sText = sline.strip
    if asBoxLines.empty?
      # first non empty text line of the box: save the tags at its start;
      # starting tags of later lines are not saved
      while sText.start_with?("[")
        iClose = sText.index("]")
        break if iClose.nil? # no closing bracket: the rest is taken as text

        asTextboxTags[-1] += sText[0..iClose]
        sText = sText[iClose + 1..-1]
      end
    end
    unless fkeep # remove other tags unless blocked
      removebetween sText
      removebetween sText, "[", "]"
    end
    asBoxLines << sText unless sText.empty?
  end

  [asTextboxTags.length, asPageAsLines, aiTextLineNumbersInPage, asTextboxTags, asTextLines]
end

# Second pass: put the translated text back into the markup.
#
# asPageAsLines, aiTextLineNumbersInPage and asTextboxTags are the values
# produced by extracttext, asTranslatedText holds one translated line per
# textbox. A missing translation is treated as empty text.
#
# Returns the text of the recreated page without trailing empty lines and
# without a final line terminator.
def mergetranslation(asPageAsLines, aiTextLineNumbersInPage, asTextboxTags, asTranslatedText)
  # the text of a box belongs into the placeholder line behind its @ line
  hBoxOfLine = {}
  aiTextLineNumbersInPage.each_with_index do |iPageLine, iBox|
    hBoxOfLine[iPageLine + 1] ||= iBox
  end

  asOutput = asPageAsLines.each_with_index.map do |sline, iline|
    iBox = hBoxOfLine[iline]
    if iBox.nil?
      sline # this line gets handed through unchanged
    else
      asTextboxTags[iBox].to_s + asTranslatedText[iBox].to_s
    end
  end

  asOutput.join("\n").sub(/(?:\r?\n)+\z/, "")
end

# == main
#
# for the use of the option parser see the documentation of OptionParser
#
options = {
  :verbose => nil,
  :dokuinfilename => "cpage.dkw",
  :textoutputfilename => "transfertext.utf8",
  :fkeep => false
} # default values go here

if defined?(OptionParser) # loading optparse may have failed, see above
  opt_parser = OptionParser.new do |opts|
    opts.banner = "Usage: iolate.rb [-i dokuinfilename] [-o textoutputfilename] [-k] [-v] [-h]"
    opts.on("-i filename", "--inputfile",
            "name of the dokuwiki file from which to extract translateable text. The default is cpage.dkw.") do |anopt|
      options[:dokuinfilename] = anopt
    end
    opts.on("-o filename", "--outputtext",
            "name of the text file to which translateable text is written. The default is transfertext.utf8.") do |anopt|
      options[:textoutputfilename] = anopt
    end
    opts.on("-k", "--keeptags", "do not attempt to delete all tags. Default is to delete them.") do
      options[:fkeep] = true
    end
    opts.on("-v", "--[no-]verbose", "show comments") do |anopt|
      options[:verbose] = anopt
    end
    opts.on("-h", "--help", "show this help.") do
      puts opts
      exit
    end
  end

  begin
    opt_parser.parse!
  rescue OptionParser::InvalidOption
    puts "\nunknown option"
    puts "in line #{__LINE__}" # the current line number in the source file.
    puts $! # error message
    puts $@ # error position
    raise
  end
end

# an option which was not given yields the value nil,
# an option which was given without an argument yields the value true

# set verbosity flag
bVerbose = options[:verbose] ? true : false

sDokuwikifile = options[:dokuinfilename]
if sDokuwikifile.nil? || sDokuwikifile.empty? # should be impossible, but well ...
  abort "after the -i option there needs to be a filename"
end

sTransferfile = options[:textoutputfilename]
if sTransferfile.nil? || sTransferfile.empty? # also impossible
  abort "after the -o option there needs to be a filename"
end

fkeep = options[:fkeep]

# the intermediate file for the second pass uses the original dokuwiki
# file name with a yaml extension
sBasename, sExtension = splitfilename(sDokuwikifile)
sYamlFile = sBasename + ".yaml"

if File.exist?(sTransferfile)
  # post translation
  unless File.exist?(sYamlFile)
    abort "error: the intermediate file #{sYamlFile} of the first pass was not found."
  end

  # the yaml file is written by the first pass of this script
  oSerializedData = YAML.load_file(sYamlFile)
  _fkeep, iNrTextboxes, asPageAsLines, aiTextLineNumbersInPage, asTextboxTags = oSerializedData

  # load translated text, one line per textbox
  asTranslatedText = File.readlines(sTransferfile, chomp: true)
  # tolerate surplus empty lines at the end of the translation
  asTranslatedText.pop while asTranslatedText.length > iNrTextboxes && asTranslatedText.last.empty?

  if asTranslatedText.length != iNrTextboxes
    STDERR.puts "warning: the number of textboxes #{iNrTextboxes} does not equal " \
                "the number of translated strings #{asTranslatedText.length}."
  end

  # make translation
  sOutputtext = mergetranslation(asPageAsLines, aiTextLineNumbersInPage, asTextboxTags, asTranslatedText)

  # construct name of output file, e.g. cpage1.dkw for cpage.dkw
  sOutputfile = sBasename + "1" + sExtension

  # write final output
  File.open(sOutputfile, "w") do |file|
    file.puts sOutputtext
  end
  puts "wrote the translated page to #{sOutputfile}" if bVerbose

  # File.delete sTransferfile # cleaning up
  # File.delete sYamlFile # cleaning up
else
  # pre translation
  unless File.exist?(sDokuwikifile)
    abort "error: the dokuwiki file #{sDokuwikifile} was not found."
  end

  aText = File.read(sDokuwikifile).split("\n")
  iNrTextboxes, asPageAsLines, aiTextLineNumbersInPage, asTextboxTags, asTextLines = extracttext(aText, fkeep)

  # collect all information for a second pass in an array
  oSerializedData = [fkeep, iNrTextboxes, asPageAsLines, aiTextLineNumbersInPage, asTextboxTags]

  # write intermediate file for second pass
  File.open(sYamlFile, "w") do |file|
    file.puts YAML.dump(oSerializedData)
  end

  # write textfile to be translated, one line per textbox
  File.open(sTransferfile, "w") do |file|
    asTextLines.each { |s| file.puts s }
  end
  puts "wrote the text of #{asTextLines.length} textboxes to #{sTransferfile} and the markup to #{sYamlFile}" if bVerbose
end

__END__
The above ends the text interpreted as a ruby program.
You can access this and all following text using the global IO object DATA,
which contains all lines after the above marker