textutils.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sat, May 25, 13:10

textutils.py
View Options

	# -- coding: utf-8 --

	# This file is part of Invenio.
	# Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2013 CERN.
	#
	# Invenio is free software; you can redistribute it and/or
	# modify it under the terms of the GNU General Public License as
	# published by the Free Software Foundation; either version 2 of the
	# License, or (at your option) any later version.
	#
	# Invenio is distributed in the hope that it will be useful, but
	# WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with Invenio; if not, write to the Free Software Foundation, Inc.,
	# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	"""
	Functions useful for text wrapping (in a box) and indenting.
	"""

	__revision__ = "$Id$"

	import sys
	import re
	import textwrap
	import htmlentitydefs
	import invenio.template
	from invenio.config import CFG_ETCDIR
	try:
	import chardet
	CHARDET_AVAILABLE = True
	except ImportError:
	CHARDET_AVAILABLE = False

	from unidecode import unidecode

	CFG_LATEX_UNICODE_TRANSLATION_CONST = {}

	CFG_WRAP_TEXT_IN_A_BOX_STYLES = {
	'__DEFAULT' : {
	'horiz_sep' : '*',
	'max_col' : 72,
	'min_col' : 40,
	'tab_str' : ' ',
	'tab_num' : 0,
	'border' : ('*', '', '', ' ', ' ', '', '', '*'),
	'prefix' : '\n',
	'suffix' : '\n',
	'break_long' : False,
	'force_horiz' : False,
	},
	'squared' : {
	'horiz_sep' : '-',
	'border' : ('+', '-', '+', '\| ', ' \|', '+', '-', '+')
	},
	'double_sharp' : {
	'horiz_sep' : '#',
	'border' : ('##', '#', '##', '## ', ' ##', '##', '#', '##')
	},
	'single_sharp' : {
	'horiz_sep' : '#',
	'border' : ('#', '#', '#', '# ', ' #', '#', '#', '#')
	},
	'single_star' : {
	'border' : ('', '', '', ' ', ' ', '', '', '',)
	},
	'double_star' : {
	},
	'no_border' : {
	'horiz_sep' : '',
	'border' : ('', '', '', '', '', '', '', ''),
	'prefix' : '',
	'suffix' : ''
	},
	'conclusion' : {
	'border' : ('', '', '', '', '', '', '', ''),
	'prefix' : '',
	'horiz_sep' : '-',
	'force_horiz' : True,
	},
	'important' : {
	'tab_num' : 1,
	},
	'ascii' : {
	'horiz_sep' : (u'├', u'─', u'┤'),
	'border' : (u'┌', u'─', u'┐', u'│ ', u' │', u'└', u'─', u'┘'),
	},
	'ascii_double' : {
	'horiz_sep' : (u'╠', u'═', u'╣'),
	'border' : (u'╔', u'═', u'╗', u'║ ', u' ║', u'╚', u'═', u'╝'),
	}

	}

	re_unicode_lowercase_a = re.compile(unicode(r"(?u)[áàäâãå]", "utf-8"))
	re_unicode_lowercase_ae = re.compile(unicode(r"(?u)[æ]", "utf-8"))
	re_unicode_lowercase_oe = re.compile(unicode(r"(?u)[œ]", "utf-8"))
	re_unicode_lowercase_e = re.compile(unicode(r"(?u)[éèëê]", "utf-8"))
	re_unicode_lowercase_i = re.compile(unicode(r"(?u)[íìïî]", "utf-8"))
	re_unicode_lowercase_o = re.compile(unicode(r"(?u)[óòöôõø]", "utf-8"))
	re_unicode_lowercase_u = re.compile(unicode(r"(?u)[úùüû]", "utf-8"))
	re_unicode_lowercase_y = re.compile(unicode(r"(?u)[ýÿ]", "utf-8"))
	re_unicode_lowercase_c = re.compile(unicode(r"(?u)[çć]", "utf-8"))
	re_unicode_lowercase_n = re.compile(unicode(r"(?u)[ñ]", "utf-8"))
	re_unicode_lowercase_ss = re.compile(unicode(r"(?u)[ß]", "utf-8"))
	re_unicode_uppercase_a = re.compile(unicode(r"(?u)[ÁÀÄÂÃÅ]", "utf-8"))
	re_unicode_uppercase_ae = re.compile(unicode(r"(?u)[Æ]", "utf-8"))
	re_unicode_uppercase_oe = re.compile(unicode(r"(?u)[Œ]", "utf-8"))
	re_unicode_uppercase_e = re.compile(unicode(r"(?u)[ÉÈËÊ]", "utf-8"))
	re_unicode_uppercase_i = re.compile(unicode(r"(?u)[ÍÌÏÎ]", "utf-8"))
	re_unicode_uppercase_o = re.compile(unicode(r"(?u)[ÓÒÖÔÕØ]", "utf-8"))
	re_unicode_uppercase_u = re.compile(unicode(r"(?u)[ÚÙÜÛ]", "utf-8"))
	re_unicode_uppercase_y = re.compile(unicode(r"(?u)[Ý]", "utf-8"))
	re_unicode_uppercase_c = re.compile(unicode(r"(?u)[ÇĆ]", "utf-8"))
	re_unicode_uppercase_n = re.compile(unicode(r"(?u)[Ñ]", "utf-8"))
	re_latex_lowercase_a = re.compile("\\\\[\"H'`~^vu=k]\{?a\}?")
	re_latex_lowercase_ae = re.compile("\\\\ae\\{\\}?")
	re_latex_lowercase_oe = re.compile("\\\\oe\\{\\}?")
	re_latex_lowercase_e = re.compile("\\\\[\"H'`~^vu=k]\\{?e\\}?")
	re_latex_lowercase_i = re.compile("\\\\[\"H'`~^vu=k]\\{?i\\}?")
	re_latex_lowercase_o = re.compile("\\\\[\"H'`~^vu=k]\\{?o\\}?")
	re_latex_lowercase_u = re.compile("\\\\[\"H'`~^vu=k]\\{?u\\}?")
	re_latex_lowercase_y = re.compile("\\\\[\"']\\{?y\\}?")
	re_latex_lowercase_c = re.compile("\\\\['uc]\\{?c\\}?")
	re_latex_lowercase_n = re.compile("\\\\[c'~^vu]\\{?n\\}?")
	re_latex_uppercase_a = re.compile("\\\\[\"H'`~^vu=k]\\{?A\\}?")
	re_latex_uppercase_ae = re.compile("\\\\AE\\{?\\}?")
	re_latex_uppercase_oe = re.compile("\\\\OE\\{?\\}?")
	re_latex_uppercase_e = re.compile("\\\\[\"H'`~^vu=k]\\{?E\\}?")
	re_latex_uppercase_i = re.compile("\\\\[\"H'`~^vu=k]\\{?I\\}?")
	re_latex_uppercase_o = re.compile("\\\\[\"H'`~^vu=k]\\{?O\\}?")
	re_latex_uppercase_u = re.compile("\\\\[\"H'`~^vu=k]\\{?U\\}?")
	re_latex_uppercase_y = re.compile("\\\\[\"']\\{?Y\\}?")
	re_latex_uppercase_c = re.compile("\\\\['uc]\\{?C\\}?")
	re_latex_uppercase_n = re.compile("\\\\[c'~^vu]\\{?N\\}?")

	def indent_text(text,
	nb_tabs=0,
	tab_str=" ",
	linebreak_input="\n",
	linebreak_output="\n",
	wrap=False):
	"""
	add tabs to each line of text
	@param text: the text to indent
	@param nb_tabs: number of tabs to add
	@param tab_str: type of tab (could be, for example "\t", default: 2 spaces
	@param linebreak_input: linebreak on input
	@param linebreak_output: linebreak on output
	@param wrap: wethever to apply smart text wrapping.
	(by means of wrap_text_in_a_box)
	@return: indented text as string
	"""
	if not wrap:
	lines = text.split(linebreak_input)
	tabs = nb_tabs*tab_str
	output = ""
	for line in lines:
	output += tabs + line + linebreak_output
	return output
	else:
	return wrap_text_in_a_box(body=text, style='no_border',
	tab_str=tab_str, tab_num=nb_tabs)

	_RE_BEGINNING_SPACES = re.compile(r'^\s*')
	_RE_NEWLINES_CLEANER = re.compile(r'\n+')
	_RE_LONELY_NEWLINES = re.compile(r'\b\n\b')
	def wrap_text_in_a_box(body='', title='', style='double_star', **args):
	"""Return a nicely formatted text box:
	e.g.
	******************
	title
	--------------
	body
	******************

	Indentation and newline are respected.
	@param body: the main text
	@param title: an optional title
	@param style: the name of one of the style in CFG_WRAP_STYLES. By default
	the double_star style is used.

	You can further tune the desired style by setting various optional
	parameters:
	@param horiz_sep: a string that is repeated in order to produce a
	separator row between the title and the body (if needed)
	or a tuple of three characters in the form (l, c, r)
	@param max_col: the maximum number of coulmns used by the box
	(including indentation)
	@param min_col: the symmetrical minimum number of columns
	@param tab_str: a string to represent indentation
	@param tab_num: the number of leveles of indentations
	@param border: a tuple of 8 element in the form
	(tl, t, tr, l, r, bl, b, br) of strings that represent the
	different corners and sides of the box
	@param prefix: a prefix string added before the box
	@param suffix: a suffix string added after the box
	@param break_long: wethever to break long words in order to respect
	max_col
	@param force_horiz: True in order to print the horizontal line even when
	there is no title

	e.g.:
	print wrap_text_in_a_box(title='prova',
	body=' 123 prova.\n Vediamo come si indenta',
	horiz_sep='-', style='no_border', max_col=20, tab_num=1)

	prova
	----------------
	123 prova.
	Vediamo come
	si indenta

	"""

	def _wrap_row(row, max_col, break_long):
	"""Wrap a single row"""
	spaces = _RE_BEGINNING_SPACES.match(row).group()
	row = row[len(spaces):]
	spaces = spaces.expandtabs()
	return textwrap.wrap(row, initial_indent=spaces,
	subsequent_indent=spaces, width=max_col,
	break_long_words=break_long)

	def _clean_newlines(text):
	text = _RE_LONELY_NEWLINES.sub(' \n', text)
	return _RE_NEWLINES_CLEANER.sub(lambda x: x.group()[:-1], text)

	body = unicode(body, 'utf-8')
	title = unicode(title, 'utf-8')

	astyle = dict(CFG_WRAP_TEXT_IN_A_BOX_STYLES['__DEFAULT'])
	if CFG_WRAP_TEXT_IN_A_BOX_STYLES.has_key(style):
	astyle.update(CFG_WRAP_TEXT_IN_A_BOX_STYLES[style])
	astyle.update(args)

	horiz_sep = astyle['horiz_sep']
	border = astyle['border']
	tab_str = astyle['tab_str'] * astyle['tab_num']
	max_col = max(astyle['max_col'] \
	- len(border[3]) - len(border[4]) - len(tab_str), 1)
	min_col = astyle['min_col']
	prefix = astyle['prefix']
	suffix = astyle['suffix']
	force_horiz = astyle['force_horiz']
	break_long = astyle['break_long']

	body = _clean_newlines(body)
	tmp_rows = [_wrap_row(row, max_col, break_long)
	for row in body.split('\n')]
	body_rows = []
	for rows in tmp_rows:
	if rows:
	body_rows += rows
	else:
	body_rows.append('')
	if not ''.join(body_rows).strip():
	# Concrete empty body
	body_rows = []

	title = _clean_newlines(title)
	tmp_rows = [_wrap_row(row, max_col, break_long)
	for row in title.split('\n')]
	title_rows = []
	for rows in tmp_rows:
	if rows:
	title_rows += rows
	else:
	title_rows.append('')
	if not ''.join(title_rows).strip():
	# Concrete empty title
	title_rows = []

	max_col = max([len(row) for row in body_rows + title_rows] + [min_col])

	mid_top_border_len = max_col \
	+ len(border[3]) + len(border[4]) - len(border[0]) - len(border[2])
	mid_bottom_border_len = max_col \
	+ len(border[3]) + len(border[4]) - len(border[5]) - len(border[7])
	top_border = border[0] \
	+ (border[1] * mid_top_border_len)[:mid_top_border_len] + border[2]
	bottom_border = border[5] \
	+ (border[6] * mid_bottom_border_len)[:mid_bottom_border_len] \
	+ border[7]
	if type(horiz_sep) is tuple and len(horiz_sep) == 3:
	horiz_line = horiz_sep[0] + (horiz_sep[1] * (max_col + 2))[:(max_col + 2)] + horiz_sep[2]
	else:
	horiz_line = border[3] + (horiz_sep * max_col)[:max_col] + border[4]

	title_rows = [tab_str + border[3] + row
	+ ' ' * (max_col - len(row)) + border[4] for row in title_rows]
	body_rows = [tab_str + border[3] + row
	+ ' ' * (max_col - len(row)) + border[4] for row in body_rows]

	ret = []
	if top_border:
	ret += [tab_str + top_border]
	ret += title_rows
	if title_rows or force_horiz:
	ret += [tab_str + horiz_line]
	ret += body_rows
	if bottom_border:
	ret += [tab_str + bottom_border]
	return (prefix + '\n'.join(ret) + suffix).encode('utf-8')

	def wait_for_user(msg=""):
	"""
	Print MSG and a confirmation prompt, waiting for user's
	confirmation, unless silent '--yes-i-know' command line option was
	used, in which case the function returns immediately without
	printing anything.
	"""
	if '--yes-i-know' in sys.argv:
	return
	print msg
	try:
	answer = raw_input("Please confirm by typing 'Yes, I know!': ")
	except KeyboardInterrupt:
	print
	answer = ''
	if answer != 'Yes, I know!':
	sys.stderr.write("ERROR: Aborted.\n")
	sys.exit(1)
	return

	def guess_minimum_encoding(text, charsets=('ascii', 'latin1', 'utf8')):
	"""Try to guess the minimum charset that is able to represent the given
	text using the provided charsets. text is supposed to be encoded in utf8.
	Returns (encoded_text, charset) where charset is the first charset
	in the sequence being able to encode text.
	Returns (text_in_utf8, 'utf8') in case no charset is able to encode text.

	@note: If the input text is not in strict UTF-8, then replace any
	non-UTF-8 chars inside it.
	"""
	text_in_unicode = text.decode('utf8', 'replace')
	for charset in charsets:
	try:
	return (text_in_unicode.encode(charset), charset)
	except (UnicodeEncodeError, UnicodeDecodeError):
	pass
	return (text_in_unicode.encode('utf8'), 'utf8')

	def encode_for_xml(text, wash=False, xml_version='1.0', quote=False):
	"""Encodes special characters in a text so that it would be
	XML-compliant.
	@param text: text to encode
	@return: an encoded text"""
	text = text.replace('&', '&')
	text = text.replace('<', '<')
	if quote:
	text = text.replace('"', '"')
	if wash:
	text = wash_for_xml(text, xml_version=xml_version)
	return text

	try:
	unichr(0x100000)
	RE_ALLOWED_XML_1_0_CHARS = re.compile(u'[^\U00000009\U0000000A\U0000000D\U00000020-\U0000D7FF\U0000E000-\U0000FFFD\U00010000-\U0010FFFF]')
	RE_ALLOWED_XML_1_1_CHARS = re.compile(u'[^\U00000001-\U0000D7FF\U0000E000-\U0000FFFD\U00010000-\U0010FFFF]')
	except ValueError:
	# oops, we are running on a narrow UTF/UCS Python build,
	# so we have to limit the UTF/UCS char range:
	RE_ALLOWED_XML_1_0_CHARS = re.compile(u'[^\U00000009\U0000000A\U0000000D\U00000020-\U0000D7FF\U0000E000-\U0000FFFD]')
	RE_ALLOWED_XML_1_1_CHARS = re.compile(u'[^\U00000001-\U0000D7FF\U0000E000-\U0000FFFD]')

	def wash_for_xml(text, xml_version='1.0'):
	"""
	Removes any character which is not in the range of allowed
	characters for XML. The allowed characters depends on the version
	of XML.

	- XML 1.0:
	<http://www.w3.org/TR/REC-xml/#charsets>
	- XML 1.1:
	<http://www.w3.org/TR/xml11/#charsets>

	@param text: input string to wash.
	@param xml_version: version of the XML for which we wash the
	input. Value for this parameter can be '1.0' or '1.1'
	"""
	if xml_version == '1.0':
	return RE_ALLOWED_XML_1_0_CHARS.sub('', unicode(text, 'utf-8')).encode('utf-8')
	else:
	return RE_ALLOWED_XML_1_1_CHARS.sub('', unicode(text, 'utf-8')).encode('utf-8')

	def wash_for_utf8(text, correct=True):
	"""Return UTF-8 encoded binary string with incorrect characters washed away.

	@param text: input string to wash (can be either a binary string or a Unicode string)
	@param correct: whether to correct bad characters or throw exception
	"""
	if isinstance(text, unicode):
	return text.encode('utf-8')

	errors = "ignore" if correct else "strict"
	return text.decode("utf-8", errors).encode("utf-8", errors)

	def nice_size(size):
	"""
	@param size: the size.
	@type size: int
	@return: a nicely printed size.
	@rtype: string
	"""
	websearch_templates = invenio.template.load('websearch')
	unit = 'B'
	if size > 1024:
	size /= 1024.0
	unit = 'KB'
	if size > 1024:
	size /= 1024.0
	unit = 'MB'
	if size > 1024:
	size /= 1024.0
	unit = 'GB'
	return '%s %s' % (websearch_templates.tmpl_nice_number(size, max_ndigits_after_dot=2), unit)

	def remove_line_breaks(text):
	"""
	Remove line breaks from input, including unicode 'line
	separator', 'paragraph separator', and 'next line' characters.
	"""
	return unicode(text, 'utf-8').replace('\f', '').replace('\n', '').replace('\r', '').replace(u'\xe2\x80\xa8', '').replace(u'\xe2\x80\xa9', '').replace(u'\xc2\x85', '').encode('utf-8')

	def decode_to_unicode(text, default_encoding='utf-8'):
	"""
	Decode input text into Unicode representation by first using the default
	encoding utf-8.
	If the operation fails, it detects the type of encoding used in the given text.
	For optimal result, it is recommended that the 'chardet' module is installed.
	NOTE: Beware that this might be slow for very large strings.

	If chardet detection fails, it will try to decode the string using the basic
	detection function guess_minimum_encoding().

	Also, bear in mind that it is impossible to detect the correct encoding at all
	times, other then taking educated guesses. With that said, this function will
	always return some decoded Unicode string, however the data returned may not
	be the same as original data in some cases.

	@param text: the text to decode
	@type text: string

	@param default_encoding: the character encoding to use. Optional.
	@type default_encoding: string

	@return: input text as Unicode
	@rtype: string
	"""
	if not text:
	return ""
	try:
	return text.decode(default_encoding)
	except (UnicodeError, LookupError):
	pass
	detected_encoding = None
	if CHARDET_AVAILABLE:
	# We can use chardet to perform detection
	res = chardet.detect(text)
	if res['confidence'] >= 0.8:
	detected_encoding = res['encoding']
	if detected_encoding == None:
	# No chardet detection, try to make a basic guess
	dummy, detected_encoding = guess_minimum_encoding(text)
	return text.decode(detected_encoding)

	def translate_latex2unicode(text, kb_file="%s/bibconvert/KB/latex-to-unicode.kb" % \
	(CFG_ETCDIR,)):
	"""
	This function will take given text, presumably containing LaTeX symbols,
	and attempts to translate it to Unicode using the given or default KB
	translation table located under CFG_ETCDIR/bibconvert/KB/latex-to-unicode.kb.
	The translated Unicode string will then be returned.

	If the translation table and compiled regular expression object is not
	previously generated in the current session, they will be.

	@param text: a text presumably containing LaTeX symbols.
	@type text: string

	@param kb_file: full path to file containing latex2unicode translations.
	Defaults to CFG_ETCDIR/bibconvert/KB/latex-to-unicode.kb
	@type kb_file: string

	@return: Unicode representation of translated text
	@rtype: unicode
	"""
	# First decode input text to Unicode
	try:
	text = decode_to_unicode(text)
	except UnicodeDecodeError:
	text = unicode(wash_for_utf8(text))
	# Load translation table, if required
	if CFG_LATEX_UNICODE_TRANSLATION_CONST == {}:
	_load_latex2unicode_constants(kb_file)
	# Find all matches and replace text
	for match in CFG_LATEX_UNICODE_TRANSLATION_CONST['regexp_obj'].finditer(text):
	# If LaTeX style markers {, } and $ are before or after the matching text, it
	# will replace those as well
	text = re.sub("[\{\$]?%s[\}\$]?" % (re.escape(match.group()),), \
	CFG_LATEX_UNICODE_TRANSLATION_CONST['table'][match.group()], \
	text)
	# Return Unicode representation of translated text
	return text

	def _load_latex2unicode_constants(kb_file="%s/bibconvert/KB/latex-to-unicode.kb" % \
	(CFG_ETCDIR,)):
	"""
	Load LaTeX2Unicode translation table dictionary and regular expression object
	from KB to a global dictionary.

	@param kb_file: full path to file containing latex2unicode translations.
	Defaults to CFG_ETCDIR/bibconvert/KB/latex-to-unicode.kb
	@type kb_file: string

	@return: dict of type: {'regexp_obj': regexp match object,
	'table': dict of LaTeX -> Unicode mappings}
	@rtype: dict
	"""
	try:
	data = open(kb_file)
	except IOError:
	# File not found or similar
	sys.stderr.write("\nCould not open LaTeX to Unicode KB file. Aborting translation.\n")
	return CFG_LATEX_UNICODE_TRANSLATION_CONST
	latex_symbols = []
	translation_table = {}
	for line in data:
	# The file has form of latex\|--\|utf-8. First decode to Unicode.
	line = line.decode('utf-8')
	mapping = line.split('\|--\|')
	translation_table[mapping[0].rstrip('\n')] = mapping[1].rstrip('\n')
	latex_symbols.append(re.escape(mapping[0].rstrip('\n')))
	data.close()
	CFG_LATEX_UNICODE_TRANSLATION_CONST['regexp_obj'] = re.compile("\|".join(latex_symbols))
	CFG_LATEX_UNICODE_TRANSLATION_CONST['table'] = translation_table

	def translate_to_ascii(values):
	"""
	Transliterate the string contents of the given sequence into ascii representation.
	Returns a sequence with the modified values if the module 'unidecode' is
	available. Otherwise it will fall back to the inferior strip_accents function.

	For example: H\xc3\xb6hne becomes Hohne.

	Note: Passed strings are returned as a list.

	@param values: sequence of strings to transform
	@type values: sequence

	@return: sequence with values transformed to ascii
	@rtype: sequence
	"""
	if not values and not type(values) == str:
	return values

	if type(values) == str:
	values = [values]
	for index, value in enumerate(values):
	if not value:
	continue
	unicode_text = decode_to_unicode(value)
	if u"[?]" in unicode_text:
	decoded_text = []
	for unicode_char in unicode_text:
	decoded_char = unidecode(unicode_char)
	# Skip unrecognized characters
	if decoded_char != "[?]":
	decoded_text.append(decoded_char)
	ascii_text = ''.join(decoded_text).encode('ascii')
	else:
	ascii_text = unidecode(unicode_text).replace(u"[?]", u"").encode('ascii')
	values[index] = ascii_text
	return values

	def xml_entities_to_utf8(text, skip=('lt', 'gt', 'amp')):
	"""
	Removes HTML or XML character references and entities from a text string
	and replaces them with their UTF-8 representation, if possible.

	@param text: The HTML (or XML) source text.
	@type text: string

	@param skip: list of entity names to skip when transforming.
	@type skip: iterable

	@return: The plain text, as a Unicode string, if necessary.
	@author: Based on http://effbot.org/zone/re-sub.htm#unescape-html
	"""
	def fixup(m):
	text = m.group(0)
	if text[:2] == "&#":
	# character reference
	try:
	if text[:3] == "&#x":
	return unichr(int(text[3:-1], 16)).encode("utf-8")
	else:
	return unichr(int(text[2:-1])).encode("utf-8")
	except ValueError:
	pass
	else:
	# named entity
	if text[1:-1] not in skip:
	try:
	text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]).encode("utf-8")
	except KeyError:
	pass
	return text # leave as is
	return re.sub("&#?\w+;", fixup, text)

	def strip_accents(x):
	"""
	Strip accents in the input phrase X (assumed in UTF-8) by replacing
	accented characters with their unaccented cousins (e.g. é by e).

	@param x: the input phrase to strip.
	@type x: string

	@return: Return such a stripped X.
	"""
	x = re_latex_lowercase_a.sub("a", x)
	x = re_latex_lowercase_ae.sub("ae", x)
	x = re_latex_lowercase_oe.sub("oe", x)
	x = re_latex_lowercase_e.sub("e", x)
	x = re_latex_lowercase_i.sub("i", x)
	x = re_latex_lowercase_o.sub("o", x)
	x = re_latex_lowercase_u.sub("u", x)
	x = re_latex_lowercase_y.sub("x", x)
	x = re_latex_lowercase_c.sub("c", x)
	x = re_latex_lowercase_n.sub("n", x)
	x = re_latex_uppercase_a.sub("A", x)
	x = re_latex_uppercase_ae.sub("AE", x)
	x = re_latex_uppercase_oe.sub("OE", x)
	x = re_latex_uppercase_e.sub("E", x)
	x = re_latex_uppercase_i.sub("I", x)
	x = re_latex_uppercase_o.sub("O", x)
	x = re_latex_uppercase_u.sub("U", x)
	x = re_latex_uppercase_y.sub("Y", x)
	x = re_latex_uppercase_c.sub("C", x)
	x = re_latex_uppercase_n.sub("N", x)

	# convert input into Unicode string:
	try:
	y = unicode(x, "utf-8")
	except:
	return x # something went wrong, probably the input wasn't UTF-8
	# asciify Latin-1 lowercase characters:
	y = re_unicode_lowercase_a.sub("a", y)
	y = re_unicode_lowercase_ae.sub("ae", y)
	y = re_unicode_lowercase_oe.sub("oe", y)
	y = re_unicode_lowercase_e.sub("e", y)
	y = re_unicode_lowercase_i.sub("i", y)
	y = re_unicode_lowercase_o.sub("o", y)
	y = re_unicode_lowercase_u.sub("u", y)
	y = re_unicode_lowercase_y.sub("y", y)
	y = re_unicode_lowercase_c.sub("c", y)
	y = re_unicode_lowercase_n.sub("n", y)
	y = re_unicode_lowercase_ss.sub("ss", y)
	# asciify Latin-1 uppercase characters:
	y = re_unicode_uppercase_a.sub("A", y)
	y = re_unicode_uppercase_ae.sub("AE", y)
	y = re_unicode_uppercase_oe.sub("OE", y)
	y = re_unicode_uppercase_e.sub("E", y)
	y = re_unicode_uppercase_i.sub("I", y)
	y = re_unicode_uppercase_o.sub("O", y)
	y = re_unicode_uppercase_u.sub("U", y)
	y = re_unicode_uppercase_y.sub("Y", y)
	y = re_unicode_uppercase_c.sub("C", y)
	y = re_unicode_uppercase_n.sub("N", y)
	# return UTF-8 representation of the Unicode string:
	return y.encode("utf-8")


	def show_diff(original, modified, prefix='', suffix='',
	prefix_unchanged=' ',
	suffix_unchanged='',
	prefix_removed='-',
	suffix_removed='',
	prefix_added='+',
	suffix_added=''):
	"""
	Returns the diff view between original and modified strings.
	Function checks both arguments line by line and returns a string
	with a:
	- prefix_unchanged when line is common to both sequences
	- prefix_removed when line is unique to sequence 1
	- prefix_added when line is unique to sequence 2
	and a corresponding suffix in each line
	@param original: base string
	@param modified: changed string
	@param prefix: prefix of the output string
	@param suffix: suffix of the output string
	@param prefix_unchanged: prefix of the unchanged line
	@param suffix_unchanged: suffix of the unchanged line
	@param prefix_removed: prefix of the removed line
	@param suffix_removed: suffix of the removed line
	@param prefix_added: prefix of the added line
	@param suffix_added: suffix of the added line

	@return: string with the comparison of the records
	@rtype: string
	"""
	import difflib
	differ = difflib.Differ()

	result = [prefix]
	for line in differ.compare(modified.splitlines(), original.splitlines()):
	if line[0] == ' ':
	# Mark as unchanged
	result.append(prefix_unchanged + line[2:].strip() + suffix_unchanged)
	elif line[0] == '-':
	# Mark as removed
	result.append(prefix_removed + line[2:].strip() + suffix_removed)
	elif line[0] == '+':
	# Mark as added/modified
	result.append(prefix_added + line[2:].strip() + suffix_added)

	result.append(suffix)
	return '\n'.join(result)


	def transliterate_ala_lc(value):
	"""
	Transliterate a string.
	Compatibility with the ALA-LC romanization standard:
	http://www.loc.gov/catdir/cpso/roman.html

	Maps from one system of writing into another, letter by letter.
	Uses 'unidecode' if available.

	@param values: string to transform
	@type values: string

	@return: string transliterated
	@rtype: string
	"""
	if not value:
	return value
	text = unidecode(value)
	return text


	def escape_latex(text):
	"""
	This function takes the given text and escapes characters
	that have a special meaning in LaTeX: # $ % ^ & _ { } ~ \
	"""
	text = unicode(text.decode('utf-8'))
	CHARS = {
	'&': r'\&',
	'%': r'\%',
	'$': r'\$',
	'#': r'\#',
	'_': r'\_',
	'{': r'\{',
	'}': r'\}',
	'~': r'\~{}',
	'^': r'\^{}',
	'\\': r'\textbackslash{}',
	}
	escaped = "".join([CHARS.get(char, char) for char in text])
	return escaped.encode('utf-8')

textutils.pyNo OneTemporaryActions

File Metadata

textutils.pyView Options

Event Timeline

textutils.py
No OneTemporary
Actions

textutils.py
View Options