Index: svgscripts/test_positional_word_part.py
===================================================================
--- svgscripts/test_positional_word_part.py	(revision 5)
+++ svgscripts/test_positional_word_part.py	(revision 6)
@@ -1,89 +1,93 @@
 import unittest
 from os import sep, path
 from os.path import isdir, dirname, basename
 import lxml.etree as ET
 
 from datatypes.positional_word_part import PositionalWordPart
 from datatypes.page import Page
 
 
 class TestPositionalWordPart(unittest.TestCase):
     def setUp(self):
         DATADIR = dirname(__file__) + sep + 'test_data'
         if not isdir(DATADIR):
             DATADIR = dirname(dirname(__file__)) + sep + 'test_data'
         self.test_svg_file = DATADIR + sep + 'path_svg.svg' 
         self.test_xml = DATADIR + sep + 'W_I_8_page125.xml'
         self.word_part_objs = [{'text': 'a' }, {'text': 'b' }, {'text': 'c' }]
         x = 0
         for dict in self.word_part_objs:
             dict['class'] = 'st15'
             dict['x'] =  x
             dict['y'] = 11
             x += 1
 
     def test_init(self):
         pwp = PositionalWordPart(text='test')
         self.assertEqual(pwp.text, 'test')
         
     def test_attach_object_to_tree(self):
         pwp = PositionalWordPart(text='test', symbol_id='glyph-32-1', style_class='st1 st2 st3')
         empty_tree = ET.ElementTree(ET.Element('page'))
         pwp.attach_object_to_tree(empty_tree)
         for node in empty_tree.getroot().xpath('//' + pwp.tag):
             self.assertEqual(node.get('id'), '0')
             self.assertEqual(node.get('symbol-id'), 'glyph-32-1')
 
     def test_init_node(self):
         pwp = PositionalWordPart(text='test', symbol_id='glyph-32-1', style_class='st1 st2 st3')
         empty_tree = ET.ElementTree(ET.Element('page'))
         pwp.attach_object_to_tree(empty_tree)
         pwp2 = PositionalWordPart(node=empty_tree.getroot().find('./' + pwp.tag))
         self.assertEqual(pwp2.id, pwp.id)
         self.assertEqual(pwp2.text, pwp.text)
 
     def test_CREATE_POSITIONAL_WORD_PART(self):
         svg_tree = ET.parse(self.test_svg_file)
         namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
         xmin = 311.8125
         ymin = 158.0117
         text = 'es'
         style_class = 'st5 st6'
         x = 258.148
         y = 8.5
         svg_x = x + xmin
         svg_y = y + ymin
         use_nodes = svg_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\
                     .format(svg_x-.1, svg_x+.1,svg_y-0.1, svg_y+.1), namespaces=namespaces)
         self.assertEqual(len(use_nodes), 1)
         pwp = PositionalWordPart.CREATE_POSITIONAL_WORD_PART(text[0], use_nodes[0], namespaces, xmin=xmin, ymin=ymin, style_class=style_class)
         self.assertEqual(pwp.height, 3.672)
         self.assertEqual(pwp.width, 2.594)
 
     def test_CREATE_POSITIONAL_WORD_PART_LIST(self):
         page = Page(xml_source_file=self.test_xml)
         svg_tree = ET.parse(self.test_svg_file)
         namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
         xmin = 311.8125
         ymin = 158.0117
         text = 'es'
         style_class = 'st5 st6'
         x = 258.148
         y = 8.5
         word_part_obj = { 'text': text, 'x': x, 'y': y, 'matrix': None, 'class': style_class } 
         pwp_list = PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(word_part_obj, svg_tree, namespaces, page, xmin=xmin, ymin=ymin)
-        self.assertEqual(len(pwp_list), 2)
+        self.assertEqual(len(pwp_list), len(text))
         self.assertEqual(pwp_list[0].height, 3.672)
         self.assertEqual(pwp_list[0].width, 2.594)
+        text = 'ergleicher'
+        word_part_obj = { 'text': text, 'x': 174.619, 'y': 189.6, 'matrix': None, 'class': style_class } 
+        pwp_list = PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(word_part_obj, svg_tree, namespaces, page, xmin=xmin, ymin=ymin)
+        self.assertEqual(len(pwp_list), len(text))
 
     def test_CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(self):
         page = Page(xml_source_file=self.test_xml)
         pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, self.word_part_objs)
         self.assertEqual(len(pwps), 3)
         self.assertEqual(pwps[0].text, 'a')
         self.assertEqual(pwps[0].style_class, 'st15')
         self.assertEqual(pwps[0].width, 0.8)
         self.assertEqual(pwps[2].width, 3.85)
 
 if __name__ == "__main__":
     unittest.main()
Index: svgscripts/datatypes/positional_word_part.py
===================================================================
--- svgscripts/datatypes/positional_word_part.py	(revision 5)
+++ svgscripts/datatypes/positional_word_part.py	(revision 6)
@@ -1,152 +1,152 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
 """   This class can be used to represent a positional word part, i.e. part of a word that has a position on the transkription.
 """
 #    Copyright (C) University of Basel 2019  {{{1
 #
 #    This program is free software: you can redistribute it and/or modify
 #    it under the terms of the GNU General Public License as published by
 #    the Free Software Foundation, either version 3 of the License, or
 #    (at your option) any later version.
 #
 #    This program is distributed in the hope that it will be useful,
 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #    GNU General Public License for more details.
 #
 #    You should have received a copy of the GNU General Public License
 #    along with this program.  If not, see <https://www.gnu.org/licenses/> 1}}}
 
 __author__ = "Christian Steiner"
 __maintainer__ = __author__
 __copyright__ = 'University of Basel'
 __email__ = "christian.steiner@unibas.ch"
 __status__ = "Development"
 __license__ = "GPL v3"
 __version__ = "0.0.1"
 
 from lxml import etree as ET
 from svgpathtools.parser import parse_path
 import warnings
 
 from .positional_object import PositionalObject 
 
 class PositionalWordPart(PositionalObject):
     """
     This class represents a positional word part, i.e. a part of a word that has a position on the transkription.
 
     Args:
 
         id (int):                   object id
         text (str):                 text
         symbol_id (str):            id of corresponding symbol
         style_class (str)           style class id
         matrix (datatypes.Matrix):  matrix containing information about conversion.
         height (float):             height of 
         width (float):              width of object
         x (float):                  x position of object
         y (float):                  y position of object
     """
     XML_TAG = 'word-part'
 
     def __init__(self, node=None, id=0, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, text=None, symbol_id=None, style_class=None):
         super(PositionalWordPart, self).__init__(id=id, node=node, height=height, width=width, x=x, y=y, matrix=matrix, tag=PositionalWordPart.XML_TAG)
         self.stringKeys += [ 'text', 'symbol_id', 'style_class' ]
         self.text = text
         self.symbol_id = symbol_id
         self.style_class = style_class
         if node is not None:
             self.text = node.get('text')
             self.symbol_id = node.get('symbol-id')
             self.style_class = node.get('style-class')
 
     @staticmethod
     def CREATE_POSITIONAL_WORD_PART(text, use_node, namespaces, start_id=0, xmin=0.0, ymin=0.0, matrix=None, style_class=None):
         """Creates a PositionalWordPart.
 
             [:return:] a PositionalWordPart
         """
         symbol_id = use_node.get('{%s}href' % namespaces['xlink']).replace('#', '') 
         x = float(use_node.get('x')) - xmin if bool(use_node.get('x')) else 0.0
         y = float(use_node.get('y')) - ymin if bool(use_node.get('y')) else 0.0
         d_strings = use_node.xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces)
         if len(d_strings) > 0:
             path = parse_path(d_strings[0])
             xmin, xmax, ymin, ymax = path.bbox()
             width = xmax - xmin
             height = ymax - ymin 
             return PositionalWordPart(id=start_id, text=text, height=height, width=width, x=x, y=y-height,\
                     matrix=matrix, symbol_id=symbol_id, style_class=style_class)
         else:
             return PositionalWordPart(id=start_id, text=text, x=x, y=y, matrix=matrix, symbol_id=symbol_id, style_class=style_class)
 
     @staticmethod
     def CREATE_POSITIONAL_WORD_PART_LIST(word_part_obj, svg_path_tree, namespaces, page, start_id=0, xmin=0.0, ymin=0.0):
         """Creates a list of PositionalWordPart from a word_part_obj (a dictionary with the keys: text, x, y, matrix, class),
             using a (lxml.ElementTree) svg_path_tree and the corresponding namespaces.
 
             [:return:] a list of PositionalWordPart
         """
         THRESHOLD = 0.4
         word_part_list = []
         x = float(word_part_obj['x']) if bool(word_part_obj.get('x')) else 0.0
         y = float(word_part_obj['y']) if bool(word_part_obj.get('y')) else 0.0
         text = word_part_obj.get('text')
         matrix = word_part_obj.get('matrix')
         style_class = word_part_obj.get('class')
         if text is not None and text != '':
             svg_x = x + xmin
             svg_y = y + ymin
             use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\
                     .format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces)
             if len(use_nodes) > 0:
                 current_use_node = use_nodes[0]
                 index = 0
                 word_part_list.append(PositionalWordPart.CREATE_POSITIONAL_WORD_PART(text[index], current_use_node, namespaces,\
                         start_id=start_id, xmin=xmin, ymin=ymin, matrix=matrix, style_class=style_class))
                 index, start_id = index+1, start_id+1
                 while index < len(text) and current_use_node.getnext() is not None:
                     current_use_node = current_use_node.getnext()
                     word_part_list.append(PositionalWordPart.CREATE_POSITIONAL_WORD_PART(text[index], current_use_node, namespaces,\
                         start_id=start_id, xmin=xmin, ymin=ymin, matrix=matrix, style_class=style_class))
                     index, start_id = index+1, start_id+1
                 if index < len(text) and current_use_node.getnext() is None:
                     last_pwp = word_part_list[len(word_part_list)-1]
                     word_part_obj['x'] = last_pwp.left + last_pwp.width + 0.5
-                    word_part_obj['y'] = last_pwp.top
-                    word_part_obj['text'] = last_pwp.text[index:]
+                    word_part_obj['y'] = last_pwp.bottom
+                    word_part_obj['text'] = word_part_obj['text'][index:]
                     word_part_list += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(word_part_obj,\
                             svg_path_tree, namespaces, page, start_id=start_id, xmin=xmin, ymin=ymin)
                 return word_part_list
             else:
                 warnings.warn('No use_node found for text {} svg_x {}, svg_y {}'.format(text, svg_x, svg_y))
                 return PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, [word_part_obj]) 
             #[ PositionalWordPart(id=start_id, text=text, x=x, y=y, matrix=matrix, style_class=style_class) ]
         else:
             return [ ]
 
     @staticmethod
     def CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs):
         """Creates a list of PositionalWordPart from word_part_objs (i.e. a list of dictionaries  
         with the keys: text, x, y, matrix, class).
 
             [:return:] a list of (datatypes.positional_word_part) PositionalWordPart
         """
         positional_word_parts = []
         HEIGHT_FACTOR = 1.1 # factor that multiplies font_size -> height
         FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize
         SPACING = 0.2
         for index, part_obj in enumerate(word_part_objs):
             text = part_obj.get('text')
             matrix = part_obj.get('matrix')
             style_class = part_obj.get('class')
             x = float(part_obj['x']) if bool(part_obj.get('x')) else 0.0
             y = float(part_obj['y']) if bool(part_obj.get('y')) else 0.0
             font_size = page.get_biggest_fontSize4styles(style_set=set(style_class.split(' ')))
             height = round(font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / font_size, 3)
             width = round(font_size * FONTWIDTHFACTOR, 3)
             if index+1 < len(word_part_objs) and bool(word_part_objs[index+1].get('x')):
                 width = float(word_part_objs[index+1]['x']) - x - SPACING
             positional_word_parts.append(PositionalWordPart(id=index, text=text, height=height, width=width, x=x, y=y, matrix=matrix, style_class=style_class))
         return positional_word_parts
Index: svgscripts/datatypes/transkription_position.py
===================================================================
--- svgscripts/datatypes/transkription_position.py	(revision 5)
+++ svgscripts/datatypes/transkription_position.py	(revision 6)
@@ -1,139 +1,139 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
 """   This class can be used to represent a transkription word position.
 """
 #    Copyright (C) University of Basel 2019  {{{1
 #
 #    This program is free software: you can redistribute it and/or modify
 #    it under the terms of the GNU General Public License as published by
 #    the Free Software Foundation, either version 3 of the License, or
 #    (at your option) any later version.
 #
 #    This program is distributed in the hope that it will be useful,
 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #    GNU General Public License for more details.
 #
 #    You should have received a copy of the GNU General Public License
 #    along with this program.  If not, see <https://www.gnu.org/licenses/> 1}}}
 
 __author__ = "Christian Steiner"
 __maintainer__ = __author__
 __copyright__ = 'University of Basel'
 __email__ = "christian.steiner@unibas.ch"
 __status__ = "Development"
 __license__ = "GPL v3"
 __version__ = "0.0.1"
 
 from lxml import etree as ET
 from os.path import isfile
 
 
 from .debug_message import DebugMessage
 from .positional_word_part import PositionalWordPart
 from .word_position import WordPosition 
 from .matrix import Matrix
 
 class TranskriptionPosition(WordPosition):
     """
     This class represents a transkription word position.
 
     Args:
         id (int):                       word id
         matrix (datatypes.Matrix):      matrix containing information about transformation.
         height (float):                 height of word
         width (float):                  width of word
         x (float):                      x position of word
         y (float):                      y position of word
         positional_word_parts           a list of (datatypes.positional_word_part) PositionalWordPart
         debug_message                   a (datatypes.debug_message) DebugMessage
     """
     ADD2X = 0.15
     ADD2TOP = 1.0
     ADD2BOTTOM = 0.2
     HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height
 
     def __init__(self, id=0, node=None, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, positional_word_parts=[], debug_message=None):
         super(WordPosition, self).__init__(id=id, node=node, height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION)
         self.positional_word_parts = positional_word_parts
         self.debug_message = debug_message
         if node is not None:
             self.debug_message = DebugMessage(node=node.xpath('.//' + DebugMessage.XML_TAG)[0])\
                     if len(node.xpath('.//' + DebugMessage.XML_TAG)) > 0 else None
             self.positional_word_parts = [ PositionalWordPart(node=pwp_node) for pwp_node in node.xpath('.//' + PositionalWordPart.XML_TAG) ] 
         self.attachable_objects += self.positional_word_parts
         if self.debug_message is not None:
             self.attachable_objects.append(self.debug_message)
 
     @staticmethod
     def CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, positional_word_parts, debug_message=None, debug_msg_string=None):
         """Creates a list of TranskriptionPosition from a list of (datatypes.positional_word_part) PositionalWordPart.
 
             [:return:] a list of (datatypes.transkription_position) TranskriptionPosition
         """
         TOPCORRECTION = 1
         debug_message = DebugMessage(message=debug_msg_string)\
                 if debug_msg_string is not None else debug_message
         transkription_positions = []
         if len(positional_word_parts) < 1:
             return []
         matrix = positional_word_parts[0].transform
         index = 0
         matrices_differ = False
         while index < len(positional_word_parts) and not matrices_differ:
             if Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix, positional_word_parts[index].transform):
                 matrices_differ = True
             else:
                 index += 1
         if matrices_differ and index < len(positional_word_parts):
             transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, positional_word_parts[index:])
             positional_word_parts = positional_word_parts[:index]
         if page.get_line_number((positional_word_parts[0].top + positional_word_parts[0].bottom)/2) % 2 == 0:
             all_styles = []
             for pwp in positional_word_parts:
                 all_styles += pwp.style_class.split(' ')
             biggest_font_size = page.get_biggest_fontSize4styles(style_set=set(all_styles))
             height = round(biggest_font_size * TranskriptionPosition.HEIGHT_FACTOR + TranskriptionPosition.HEIGHT_FACTOR / biggest_font_size, 3)
             TOPCORRECTION = 2 + TranskriptionPosition.HEIGHT_FACTOR / biggest_font_size
         else:
             # take greatest value for height
             height = [ pwp.height for pwp in sorted(positional_word_parts, key=lambda pwp: pwp.height, reverse=True)][0] + 2*TOPCORRECTION
         x = positional_word_parts[0].left - TranskriptionPosition.ADD2X
-        y = positional_word_parts[0].top - TOPCORRECTION
+        y = [ pwp.top for pwp in sorted(positional_word_parts, key=lambda pwp: pwp.top)][0] - TOPCORRECTION
         width = positional_word_parts[len(positional_word_parts)-1].left - x\
                 + positional_word_parts[len(positional_word_parts)-1].width + TranskriptionPosition.ADD2X
         for index, pwp in enumerate(positional_word_parts):
             pwp.id = index
         transkription_positions.insert(0, TranskriptionPosition(height=height, width=width, x=x, y=y, matrix=matrix,\
                 positional_word_parts=positional_word_parts, debug_message=debug_message)) 
         return transkription_positions
 
     @staticmethod
     def CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=None, debug_msg_string=None, transkription_field=None):
         """Creates a list of TranskriptionPosition from word_part_objs (i.e. a list of dictionaries  
         with the keys: text, x, y, matrix, class).
 
             [:return:] a list of (datatypes.transkription_position) TranskriptionPosition
         """
         positional_word_parts = []
         debug_message = DebugMessage(message=debug_msg_string)\
                 if debug_msg_string is not None else None
         if page.svg_file is not None and isfile(page.svg_file):
             svg_path_tree = ET.parse(page.svg_file)
             namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
             xmin = 0.0
             ymin = 0.0
             if transkription_field is not None:
                 xmin = transkription_field.xmin
                 ymin = transkription_field.ymin
             for part_obj in word_part_objs:
                 positional_word_parts += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(\
                         part_obj, svg_path_tree, namespaces, page, start_id=len(positional_word_parts),\
                         xmin=xmin, ymin=ymin)
         else:
             positional_word_parts = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs)
         if len(positional_word_parts) > 0:
             return TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(page, positional_word_parts, debug_message=debug_message)
         else:
             return [ TranskriptionPosition(matrix=matrix, debug_message=debug_message) ]
Index: svgscripts/extractWordPosition.py
===================================================================
--- svgscripts/extractWordPosition.py	(revision 5)
+++ svgscripts/extractWordPosition.py	(revision 6)
@@ -1,527 +1,528 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
 """   This program can be used to extract the position of the words in a svg file and write them to a xml file.
 """
 #    Copyright (C) University of Basel 2019  {{{1
 #
 #    This program is free software: you can redistribute it and/or modify
 #    it under the terms of the GNU General Public License as published by
 #    the Free Software Foundation, either version 3 of the License, or
 #    (at your option) any later version.
 #
 #    This program is distributed in the hope that it will be useful,
 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #    GNU General Public License for more details.
 #
 #    You should have received a copy of the GNU General Public License
 #    along with this program.  If not, see <https://www.gnu.org/licenses/> 1}}}
 
 import re
 import getopt
 import sys
 from os import sep, listdir, mkdir, path
 from os.path import exists, isfile, isdir
 from datetime import datetime
 from lxml import etree as ET
 from svgpathtools import svg2paths2
 import warnings
 
 from myxmlwriter import write_pretty
 from datatypes.lineNumber import LineNumber
 from datatypes.matrix import Matrix
 from datatypes.page import Page
 from datatypes.pdf import PDFText
 from datatypes.transkriptionField import TranskriptionField
 from datatypes.transkription_position import TranskriptionPosition
 from datatypes.word import Word
 from datatypes.word_insertion_mark import WordInsertionMark
 
 __author__ = "Christian Steiner"
 __maintainer__ = __author__
 __copyright__ = 'University of Basel'
 __email__ = "christian.steiner@unibas.ch"
 __status__ = "Development"
 __license__ = "GPL v3"
 __version__ = "0.0.1"
 
 class Extractor:
     """
     This class can be used to extract the word positions in a svg file and write it to a xml file.
 
     Args:
         [xml_dir (str): target directory]
         [title (str): title of document]
         [manuscript_file (str): xml file containing information about the archival unity to which the current page belongs
         [extract_transkription_field_only (Boolean): if true extract_word_position will extract word positions only that 
                                                      are part of the transkription field.
     """
     SONDERZEICHEN_LIST = [ 'A', 'B', '{', '}' ]
 
     def __init__(self, xml_dir=None, title=None, manuscript_file=None, extract_transkription_field_only=False):
         if bool(xml_dir):
             self.xml_dir = xml_dir
             not isdir(self.xml_dir) and mkdir(self.xml_dir)
         else:
             self.xml_dir = 'xml' if(isdir('xml')) else '' 
         self.xml_dir = self.xml_dir + sep if(bool(self.xml_dir)) else ''
         self.title = title
         self.manuscript_file = manuscript_file
         self.extract_transkription_field_only = extract_transkription_field_only
         self.manuscript_tree = None
         if not bool(self.title) and bool(self.manuscript_file) and isfile(self.manuscript_file):
             self.manuscript_tree = ET.parse(self.manuscript_file)
             self.title = self.manuscript_tree.getroot().get('title')
         elif bool(self.manuscript_file):
             raise FileNotFoundError('File "{}" does not exist!'.format(self.manuscript_file))
         elif bool(self.title):
             if not bool(self.manuscript_file):
                 self.manuscript_file = self.xml_dir + self.title.replace(' ', '_') + '.xml'
             if not isfile(self.manuscript_file):
                 self.manuscript_tree = ET.ElementTree(ET.Element('page', attrib={"title": self.title}))
                 write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_file, script_name=__file__, file_type='xmlManuscriptFile')
 
     def get_page_number(self, file_name, page_number=None):
         """ Returns page number as a string (with leading zero(s) if len(page_number) < 3).
         """
         if not bool(page_number) and bool(re.search(r'\d', file_name)):
             """if page_number=None and filename contains digits,
                 then split filename into its parts that contain only digits, remove empty strings
                 and return the last part containing only digits.
             """
             page_number = list(filter(lambda x: x != '', re.split(r'\D+', file_name))).pop()
         if bool(page_number):
             leading_zeros = '00' if(len(page_number) == 1) else '0' if(len(page_number) == 2) else ''
             return leading_zeros + str(page_number)
         else:
             return ''
 
     def get_file_name(self, file_name, page_number=None):
         """Returns the file_name of the target xml file.
         """
         dir_name = path.dirname(self.xml_dir) + sep if(bool(self.xml_dir)) else ''
         if bool(self.title):
             return dir_name + self.title.replace(' ', '_') + '_page' + self.get_page_number(file_name, page_number=page_number) + '.xml'
         else:
             return '{}{}'.format(dir_name, path.basename(file_name).replace('.svg', '.xml'))
 
     def get_style(self, etree_root):
         """Returns the style specification as a dictionary.
 
             :returns: 
                 sonderzeichen_list:     list of keys for classes that are 'Sonderzeichen'
                 style_dict:             dictionary: key = class name (str), value = style specification (dictionary)
         """
         style_dict = {}
         sonderzeichen_list = []
         letterspacing_list = []
         style = etree_root.find('style', etree_root.nsmap)
         if style is not None:
             for style_item in list(filter(lambda x: x != '', style.text.split("\n\t"))):
                 style_key = style_item.split('{')[0].replace('.', '')
                 style_value_dict = {  item.split(':')[0]: item.split(':')[1].replace('\'','') \
                     for item in list(filter(lambda x: x!= '', style_item.split('{')[1].replace('}', '').replace('\n','').split(';')))}
                 style_dict[style_key] = style_value_dict
                 if bool(style_value_dict.get('font-family')) and 'Sonderzeichen' in style_value_dict.get('font-family'):
                     sonderzeichen_list.append(style_key)
                 if bool(style_value_dict.get('letter-spacing')):
                     letterspacing_list.append(style_key)
         return sonderzeichen_list, letterspacing_list, style_dict
 
     def get_word_from_part_obj(self, word_part_obj):
         """Extracts all 'text' from a list of dicitonaries and concats it to a string.
         """
         return ''.join([ dict['text'] for dict in word_part_obj])
 
     def find_inserted_words_by_position(self, target_tree, x, y):
         """Returns an Array with the words that are inserted above the x, y position or [] if not found.
         """
         MINY = 31.0
         MAXY = 10.0
         DIFFX = 9.0
         if(len(target_tree.getroot().xpath('//word[@id]')) > 0):
             result_list = []
             minus2left = 20.0
             minus2top = 19.0
             while len(result_list) == 0 and minus2top < MINY and minus2left > DIFFX :
                 result_list = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
                         '//word[@top>{0} and @top<{1} and @left>{2} and @left<{3}]'.format(y - minus2top, y - MAXY, x - minus2left, x + DIFFX)) ]
                 minus2left -= 1
                 minus2top  += 1
             if len(result_list) > 0:
                 result_bottom = result_list[len(result_list)-1].bottom
                 result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width
                 for item in target_tree.getroot().xpath('//word[@bottom={0} and @left>{1}]'.format(result_bottom, result_left_min)):
                     result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width
                     result_left_max = result_left_min + DIFFX
                     if float(item.get('left')) - result_left_max < DIFFX:
                         result_list.append(Word.CREATE_WORD(item))
                     else:
                         break
             return result_list 
         else:
             return []
 
     def find_inserted_words(self, target_tree, word_insertion_mark):
         """Returns an Array with the words that are inserted above the word_insertion_mark.
 
             TODO: get wim by line an split words above according to the gaps between them!!!
         """
         if word_insertion_mark.line_number < 2 or word_insertion_mark.line_number % 2 == 1:
             return self.find_inserted_words_by_position(target_tree, word_insertion_mark.x, word_insertion_mark.y)
         if(len(target_tree.getroot().xpath('//word[@id]')) > 0):
             MINY = 31.0
             MAXY = 10.0
             DIFFX = 9.0
             result_list = []
             line_number = word_insertion_mark.line_number - 1 
             x = word_insertion_mark.x
             y = word_insertion_mark.y
             if word_insertion_mark.mark_type != 'B': # all insertions that are above the current line
                 if len(target_tree.getroot().xpath('//word[@line-number={0}]'.format(line_number))) > 0:
                     minus2top = 1.0
                     while len(result_list) == 0 and minus2top < MINY:
                         result_list = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
                                 '//word[@line-number={0} and @top>{1} and @left>{2} and @left<{3}]'.format(line_number, y - minus2top, x - DIFFX, x + DIFFX)) ]
                         minus2top  += 1
             elif word_insertion_mark.mark_type == 'B': # B means insertion is beneath the current line
                 line_number = word_insertion_mark.line_number + 1
                 if len(target_tree.getroot().xpath('//word[@line-number={0}]'.format(line_number))) > 0:
                     plus2top = 1.0
                     while len(result_list) == 0 and plus2top < MINY :
                         result_list = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\
                                 '//word[@line-number={0} and @top>={1} and @left>={2} and @left<={3}]'.format(line_number, y + plus2top, x - DIFFX, x + DIFFX)) ]
                         plus2top  += 1
             if len(result_list) > 0:
                 result_bottom = result_list[len(result_list)-1].transkription_positions[0].bottom
                 result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\
                         + result_list[len(result_list)-1].transkription_positions[0].width
                 for item in target_tree.getroot().xpath(\
                         '//word[@line-number={0} and @bottom>{1} and @bottom<{2} and @left>{3}]'.format(line_number, result_bottom-5, result_bottom+5, result_left_min)):
                     result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\
                             + result_list[len(result_list)-1].transkription_positions[0].width
                     result_left_max = result_left_min + DIFFX
                     if float(item.get('left')) - result_left_max < DIFFX:
                         result_list.append(Word.CREATE_WORD(item))
                     else:
                         break
             return result_list 
         else:
             return []
 
     def add_word(self, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None):
-        """Writes information about a word to xml_target_file.
+        """Creates transkription_positions and a new word from word_part_objs (i.e. a list of dictionaries about parts of this word).
+            If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, word_part_objs will be split and several words are created.
 
             :param page: the page that receives the new word(s) in page.words and the insertion marks in page.word_insertion_marks
             :param index: the id for the next word that is created
             :param word_part_objs: list of dictionaries; the keys 'text', 'x', 'y' and 'class' are read here
             :param endSign: only handed on to the recursive call for the last part of a split word
             :param endX: only handed on to the recursive call for the last part of a split word
             :param matrix: forwarded to TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST
             :param debug_msg: forwarded to TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST as debug_msg_string
             :param transkription_field: forwarded to TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST
             :returns: the new word counter (int)
         """
         # Each break point is a tuple (end_point, next_from_index): the current word ends before end_point
         # and the next word starts at next_from_index.
         break_points = []
         if(len(page.sonderzeichen_list) > 0): # check for Sonderzeichen and special chars -> mark for word insertion, create break points
             for Sonderzeichen in self.SONDERZEICHEN_LIST:
                 # one boolean per word part: True if its text equals the Sonderzeichen and its class contains one of page.sonderzeichen_list
                 contains_Sonderzeichen = [ dict['text'] == Sonderzeichen and any(sz in dict['class'] for sz in page.sonderzeichen_list) for dict in word_part_objs ]
                 if True in contains_Sonderzeichen:
                     # (endPoint, endPoint + 1): the Sonderzeichen itself is part of neither the word before nor the word after it
                     break_points += [ (endPoint, endPoint + 1) for endPoint in  [i for i, e in enumerate(contains_Sonderzeichen) if e == True ]] 
                     # NOTE(review): this iterates over all break points collected so far, i.e. also over those of a
                     # Sonderzeichen matched in an earlier iteration -- confirm that no duplicate marks are created.
                     for sz_point in [i for i, e in break_points]:
                         wim_index = len(page.word_insertion_marks)
                         x = float(word_part_objs[sz_point]['x'])
                         y = float(word_part_objs[sz_point]['y'])
                         # -1 as previous word id if the Sonderzeichen is the first part of word_part_objs
                         previous_word_id = index if (sz_point > 0) else -1
                         next_word_id = index + 1 if (index > -1) else index
                         page.word_insertion_marks.append(\
                                 WordInsertionMark(id=wim_index, x=x, y=y, line_number=page.get_line_number(y-1),\
                                 previous_word_id=previous_word_id, next_word_id=next_word_id, mark_type=Sonderzeichen))
         if(bool(re.search(r'\d[A-Za-z]', self.get_word_from_part_obj(word_part_objs)))): # case: digits from line number and chars from words -> create break points
             THRESHOLDX = 20 # Threshold between line number and text
             last_x = -1
             for i, x in enumerate([float(dict['x']) for dict in word_part_objs]):
                 if(last_x > -1 and (x - last_x > THRESHOLDX)):
                     # (i, i): split in front of part i, part i starts the following word
                     break_points.append((i, i))
                 last_x = x
         # NOTE(review): break_points is processed in insertion order and is not sorted -- presumably the points
         # are expected to be ascending; verify for words that match several of the cases above.
         if(len(break_points) > 0): # if there are break points -> split word_part_obj and add the corresponding words
             from_index = 0
             for end_point, next_from_index in break_points:
                 new_word_part_objs = word_part_objs[from_index:end_point]
                 new_endX = word_part_objs[end_point]['x']
                 from_index = next_from_index
                 # recursive call; an empty slice leaves index unchanged (see the else branch below)
                 index = self.add_word(page, index, new_word_part_objs, None, new_endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
             # add the remaining parts after the last break point as one more word
             if from_index > 0 and from_index < len(word_part_objs):
                 new_word_part_objs = word_part_objs[from_index:]
                 index = self.add_word(page, index, new_word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field)
             return index
         else:
             if len(word_part_objs) > 0:
                 transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=matrix,\
                         debug_msg_string=debug_msg, transkription_field=transkription_field)
                 text = self.get_word_from_part_obj(word_part_objs)
                 # the line number is looked up with the vertical center of the first transkription position
                 line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2)
                 newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions)
                 #newWord = Word.CREATE_WORD(page=page, word_part_objs=word_part_objs, id=index, endX=endX, endSign=endSign, matrix=matrix, debug_msg=debug_msg)
                 #newWord.attach_word_to_tree(page.page_tree) -> now we attach all words with update_and_attach_words2tree()
                 page.words.append(newWord)
                 return int(index) + 1
             else:
                 # nothing to add: the counter stays as it is
                 return int(index)
 
     def get_bottoms(self, tree_root, from_position=-1.0, to_position=-1.0, transkription_field=None):
         """Collect the distinct bottom values of all text nodes, ordered by their numeric value.
 
             The bottom of a text node is the sixth space-separated token of its 'transform' attribute
             (without closing parenthesis). The values stay strings; only the ordering uses float().
             If transkription_field is given, its ymin/ymax replace from_position/to_position.
             The range (exclusive on both ends) is applied only if both positions are greater than 0.
 
             :returns: a sorted list of bottom values (str)
         """
         unique_bottoms = set()
         for text_node in tree_root.findall(".//text", tree_root.nsmap):
             sixth_token = text_node.get('transform').split(' ')[5]
             unique_bottoms.add(sixth_token.replace(')',''))
         ordered_bottoms = sorted(unique_bottoms, key=float)
         if transkription_field is not None:
             from_position, to_position = transkription_field.ymin, transkription_field.ymax
         if not (from_position > 0.0 and to_position > 0.0):
             # no usable range -> return everything
             return ordered_bottoms
         return [ value for value in ordered_bottoms if from_position < float(value) < to_position ]
 
     def get_text_items(self, tree_root, transkription_field=None):
         """Iterate over the text elements of tree_root.
 
             Without a transkription_field all text elements are returned; otherwise only those
             for which Matrix.IS_PART_OF_TRANSKRIPTION_FIELD accepts the 'transform' attribute.
 
             :returns: an iterator over text elements
         """
         all_text_nodes = tree_root.iterfind(".//text", tree_root.nsmap)
         if transkription_field is None:
             return all_text_nodes
 
         def lies_inside_field(text_node):
             return Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(text_node.get('transform'), transkription_field)
 
         return filter(lies_inside_field, all_text_nodes)
        
     def extract_line_numbers(self, svg_tree, transkription_field):
         """Build the LineNumber objects of a page and set their tops.
 
             Text nodes that Matrix.IS_NEARX_TRANSKRIPTION_FIELD and LineNumber.IS_A_LINE_NUMBER accept
             become LineNumber objects. For each of them the top is derived from the last bottom value
             (see get_bottoms) between the previous search limit and MINABOVE above its own bottom;
             if there is no such bottom value, the top is left untouched.
 
             :returns: a list of LineNumber objects (may be empty)
         """
         MINABOVE = 3
         root = svg_tree.getroot()
         candidates = []
         for text_node in root.iterfind('.//text', root.nsmap):
             if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(text_node.get('transform'), transkription_field):
                 candidates.append(text_node)
         line_numbers = []
         for candidate in candidates:
             if LineNumber.IS_A_LINE_NUMBER(candidate):
                 line_numbers.append(LineNumber(raw_text_node=candidate, transkription_field=transkription_field))
         if not line_numbers:
             return line_numbers
         search_from = transkription_field.ymin
         for line_number in line_numbers:
             search_to = line_number.bottom + transkription_field.ymin - MINABOVE
             bottoms = self.get_bottoms(root, from_position=search_from, to_position=search_to)
             # the next line number starts searching where this one stopped
             search_from = search_to
             if bottoms:
                 line_number.setTop(float(bottoms[-1]) - transkription_field.ymin + MINABOVE)
         return line_numbers
 
     def get_word_object_multi_char_x(self, word_part_obj_dict):
         """Estimate the x of the last char of a word part.
 
             Parts with fewer than two chars keep their own x; for longer parts
             WIDTHFACTOR per char is added to the x of the part.
 
         TODO: get real widths from svg_file!!!
         """
         WIDTHFACTOR = 2.6
         char_count = len(word_part_obj_dict['text'])
         if char_count < 2:
             return word_part_obj_dict['x']
         return word_part_obj_dict['x'] + char_count * WIDTHFACTOR
 
     def extract_word_position(self, svg_tree, page, transkription_field=None):
         """Extracts word positions.
 
             Iterates over the text items returned by get_text_items, collects their text parts in
             word_part_obj and calls add_word each time the end of a word is detected.
             Nothing is returned here; add_word appends the created words to page.words.
         """
         counter = 0
         word_part_obj = []
         endSign = '%'
         last_matrix = None
         MAXBOTTOMDIFF = 5
         MAXXDIFF = 6
         for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field):
             current_matrix = Matrix(text_item.get('transform'), transkription_field=transkription_field)
             # check for line breaks: the collected parts are closed as a word if the conversion factors differ,
             # if y changed by more than MAXBOTTOMDIFF, if x is more than MAXXDIFF away from the x of the last part,
             # or if the (estimated) x of the first part lies to the right of the current x.
             if (last_matrix is not None and len(word_part_obj) > 0 and (\
                     Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix) or\
                     (abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF) or\
                     (abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']) > MAXXDIFF)))\
                     or (len(word_part_obj) > 0 and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX()):
                 endSign = '%'
                 if(self.get_word_from_part_obj(word_part_obj) != ''):
                     debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(\
-                            abs(current_matrix.getX()  - word_part_obj[len(word_part_obj)-1]['x']), abs(current_matrix.getY() - last_matrix.getY()),\
+                            round(abs(current_matrix.getX()  - word_part_obj[len(word_part_obj)-1]['x']), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\
                             str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix)))
                     # endX still holds the value of an earlier iteration here (word_part_obj is not empty)
                     counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg=debug_msg, transkription_field=transkription_field)
                 word_part_obj = []
             endX = current_matrix.getX()
             if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: <svg><text>TEXT
                 if(bool(text_item.text) and not bool(re.search(r'^\s*$', text_item.text))):
                     word_part_obj.append( { "text": text_item.text, "x": current_matrix.getX(), "y": current_matrix.getY(), "class": text_item.get('class'), "matrix": current_matrix} )
                 else:
                     # empty or whitespace-only text ends the current word
                     endSign = text_item.text
                     if(self.get_word_from_part_obj(word_part_obj) != ''):
                         counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg='svg/text/\s', transkription_field=transkription_field)  
                     word_part_obj = []
                     endSign = '%'
             for tspan_item in text_item.findall(".//tspan", svg_tree.getroot().nsmap): # case: <svg><text><tspan>TEXT
                 # presumably the x of the matrix plus the x offset of the tspan -- confirm against Matrix.add2X
                 endX = current_matrix.add2X(tspan_item.get('x'))
                 if(tspan_item.text != None and tspan_item.text != '' and not bool(re.search(r'^\s*$', tspan_item.text))):
                     y = current_matrix.add2Y(tspan_item.get('y'))
                     word_part_obj.append( { "text": tspan_item.text, "x": endX, "y": y, "class": tspan_item.get('class'), "matrix": current_matrix })
                     if len(set(page.letterspacing_list) & set(tspan_item.get('class').split(' '))) > 0: # tspan_item has a letterspacing class -> the word ends right after this tspan
                         endSign = '%'
                         if(self.get_word_from_part_obj(word_part_obj) != ''):
                             counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\
                                     debug_msg='tspan with letterspacing', transkription_field=transkription_field)  
                         word_part_obj = []
                 else:
                     # empty or whitespace-only tspan ends the current word
                     endSign = tspan_item.text
                     if(self.get_word_from_part_obj(word_part_obj) != ''):
                         counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\
                                 debug_msg='svg/text/tspan/\s', transkription_field=transkription_field)
                     word_part_obj = []
                     endSign = '%'
             last_matrix = current_matrix
         # add the parts that are still pending after the last text item
         # NOTE(review): current_matrix and endX are unbound if the loop never ran; this is only harmless if
         # get_word_from_part_obj returns '' for an empty list -- confirm.
         if(self.get_word_from_part_obj(word_part_obj) != ''):
             counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix, debug_msg='end of loop',\
                     transkription_field=transkription_field)
         # these locals are not used any more after this point
         word_part_obj = []
         endSign = '%'
     
     def update_and_attach_words2tree(self, page):
         """Replace the word nodes of page.page_tree by the current content of page.words.
 
             All existing word nodes are removed from the tree; then each word of page.words
             gets its list position as id and is attached to the tree.
         """
         tree = page.page_tree
         for stale_node in tree.xpath('//word'):
             parent = stale_node.getparent()
             parent.remove(stale_node)
         new_id = 0
         for word in page.words:
             # the id has to be set before the word is attached
             word.id = new_id
             word.attach_word_to_tree(tree)
             new_id += 1
 
     def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None):
         """Create a Page from a svg file and fill it with styles, line numbers, words and word insertion marks.
 
             If xml_target_file is missing, it is derived with get_file_name; a target without directory
             is placed relative to self.xml_dir. Line numbers are extracted only if
             self.extract_transkription_field_only is set. If the page has an existing pdf file,
             the words are compared with the words of the pdf.
 
             :raises FileNotFoundError: if file_name is not an existing file
             [:returns:] (datatypes.page) the Page containing all information.
         """
         if not isfile(file_name):
             raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
         if not bool(xml_target_file):
             xml_target_file = self.get_file_name(file_name, page_number)
         target_lacks_directory = not bool(path.dirname(xml_target_file))
         if bool(self.xml_dir) and target_lacks_directory:
             xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
         transkription_field = None
         if bool(self.extract_transkription_field_only):
             transkription_field = TranskriptionField(file_name)
         svg_tree = ET.parse(file_name)
         page = Page(xml_target_file=xml_target_file, title=self.title, page_number=page_number, pdfFile=pdfFile,\
                 svg_file=svg_file, extract_transkription_field_only=self.extract_transkription_field_only)
         sonderzeichen_list, letterspacing_list, style_dict = self.get_style(svg_tree.getroot())
         page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict)
         if transkription_field is not None:
             line_numbers = self.extract_line_numbers(svg_tree, transkription_field)
             page.init_line_numbers(line_numbers, transkription_field.ymax)
         self.extract_word_position(svg_tree, page, transkription_field=transkription_field)
         if page.pdfFile is not None and isfile(page.pdfFile):
             pdftext = PDFText(page.pdfFile, sonderzeichen=self.SONDERZEICHEN_LIST)
             pdftext.compare_svgWords2pdfWords(page, transkription_field=transkription_field)
         # the words have to be part of the tree before the inserted words can be looked up
         self.update_and_attach_words2tree(page)
         for mark in page.word_insertion_marks:
             mark.inserted_words = self.find_inserted_words(page.page_tree, mark)
             mark.attach_object_to_tree(page.page_tree)
         return page
 
     def extractAndWriteInformation(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None):
         """Run extract_information for file_name and write the resulting page tree to the xml target file.
 
             If xml_target_file is missing, it is derived with get_file_name; a target without directory
             is placed relative to self.xml_dir.
 
             :raises FileNotFoundError: if file_name is not an existing file
             :returns: 0 after the file has been written
         """
         if not isfile(file_name):
             raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name))
         if not bool(xml_target_file):
             xml_target_file = self.get_file_name(file_name, page_number)
         target_lacks_directory = not bool(path.dirname(xml_target_file))
         if bool(self.xml_dir) and target_lacks_directory:
             xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file
         page = self.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file,\
                 svg_file=svg_file, pdfFile=pdfFile)
         write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__,\
                 file_type='svgWordPosition')
         return 0
 
 def usage():
     """Print the help text of this script, i.e. the docstring of main().
     """
     help_text = main.__doc__
     print(help_text)
 
 def main(argv):
     """This program can be used to extract the position of the words in a svg file and write them to a xml file. 
 
     svgscripts/extractWordPosition.py [OPTIONS] <file|dir>
 
         <file>                              svg file OR xml target file containing file name of svg file as "/page/@source".
         <dir>                               directory containing svg files
 
         OPTIONS:
         -h|--help:                          show help
         -d|--xml-dir=xmlDir:                target directory for the xml output file(s)
         -m|--manuscript-file:               xml file containing information about the archival order to which the current page(s) belong(s)
         -o|--only-transkription-field:      extract only words that are part of the transkription field.
         -p|--page=pageNumber:               page number of the current page. For use with _one_ file only.
         -P|--PDF=pdfFile:                   pdf file - used for word correction
         -s|--svg=svgFile:                   svg web file
         -t|--title=title:                   title of the manuscript to which the current page(s) belong(s)
         -x|--xml-target-file=xmlOutputFile: xml target file 
 
         :return: exit code (int)
     """
     # NOTE: the docstring above is printed by usage() as help text, so it has to stay user-facing.
     # Exit codes: 0 on success (or after showing the help), 2 on wrong options or arguments.
     extract_transkription_field_only = False
     manuscript_file = None
     page_number = None
     pdfFile = None
     svg_file = None
     title = None
     xml_target_file = None
     xml_dir = ".{}xml".format(sep)
 
     try:
         opts, args = getopt.getopt(argv, "hod:m:t:p:s:x:P:", ["help", "only-transkription-field", "xml-dir=", "manuscript-file=", "title=", "page=", "svg=", "xml-target-file=", "PDF="])
     except getopt.GetoptError:
         usage()
         return 2
 
     for opt, arg in opts:
         # options without any file/dir argument are answered with the help text as well
         if opt in ('-h', '--help') or not args:
             usage()
             return 0
         elif opt in ('-o', '--only-transkription-field'):
             extract_transkription_field_only = True
         elif opt in ('-d', '--xml-dir'):
             xml_dir = arg
         elif opt in ('-m', '--manuscript-file'):
             manuscript_file = arg
         elif opt in ('-t', '--title'):
             title = arg
         elif opt in ('-p', '--page'):
             page_number = str(arg)
         elif opt in ('-s', '--svg'):
             svg_file = arg
         elif opt in ('-P', '--PDF'):
             pdfFile = arg
         elif opt in ('-x', '--xml-target-file'):
             xml_target_file = str(arg)
     files_to_process = list()
     for arg in args:
         if isfile(arg):
             files_to_process.append(arg)
         elif isdir(arg):
             # listdir() yields bare entry names, so they have to be joined with the directory;
             # otherwise the svg files are only found if the directory is the working directory.
             files_to_process = files_to_process + [ path.join(arg, entry) for entry in listdir(arg) if '.svg' in entry ]
         else:
             print("'{}' does not exist!".format(arg))
             return 2
     
     # case: no svg input at all or the first argument is a xml file -> read the settings from the xml target file
     if len(files_to_process) < 1 or args[0].endswith('xml'):
         if xml_target_file is None:
             xml_target_file = args[0] if len(args) > 0 else None
         if xml_target_file is not None and isfile(xml_target_file):
             target_file_tree = ET.parse(xml_target_file)
             file_name = target_file_tree.getroot().get('source')
             # values from the command line take precedence over title and page number of the xml file
             title = target_file_tree.getroot().get('title') if title is None else title
             page_number = target_file_tree.getroot().get('number') if page_number is None else page_number
             extract_transkription_field_only = (target_file_tree.getroot().get('transkription-field-only') == 'true')\
                     if target_file_tree.getroot().get('transkription-field-only') is not None else False
             if svg_file is None:
                 svg_file = target_file_tree.xpath('.//svg/@file')[0]\
                         if len(target_file_tree.xpath('.//svg/@file')) > 0 else None
             files_to_process.insert(0, file_name)
             # the xml target file itself is not an input file
             if xml_target_file in files_to_process:
                 files_to_process.remove(xml_target_file)
         else:
             usage()
             return 2
     if len(files_to_process) > 1 and (bool(page_number) or bool(xml_target_file) or bool(pdfFile) or bool(svg_file)):
         print("ERROR: too many input files:  options --PDF, --page, --svg and --xml-target-file presuppose only one input file!")
         usage()
         return 2
 
     extractor = Extractor(xml_dir=xml_dir, title=title, manuscript_file=manuscript_file, extract_transkription_field_only=extract_transkription_field_only)
     for input_file in files_to_process:
         extractor.extractAndWriteInformation(input_file, page_number=page_number, xml_target_file=xml_target_file, pdfFile=pdfFile, svg_file=svg_file)
     return 0
 
 # Executed as a script: hand the command line arguments (without the program name) to main()
 # and use its return value as exit code of the process.
 if __name__ == "__main__":
     sys.exit(main(sys.argv[1:]))