diff --git a/modules/elmsubmit/lib/.cvsignore b/modules/elmsubmit/lib/.cvsignore index c5cd3a479..9638520ce 100644 --- a/modules/elmsubmit/lib/.cvsignore +++ b/modules/elmsubmit/lib/.cvsignore @@ -1,7 +1,6 @@ Makefile Makefile.in z_* *.O *~ -*.py *.pyc \ No newline at end of file diff --git a/modules/elmsubmit/lib/Makefile.am b/modules/elmsubmit/lib/Makefile.am index ce219489f..bb46db047 100644 --- a/modules/elmsubmit/lib/Makefile.am +++ b/modules/elmsubmit/lib/Makefile.am @@ -1,31 +1,28 @@ ## $Id$ ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002, 2003, 2004, 2005 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. SUBDIRS = magic pylibdir = $(libdir)/python/cdsware -pylib_DATA = elmsubmit.py elmsubmit_misc.py config2pyobj.py elmsubmit_doctype_test.py elmsubmit_enriched2txt.py elmsubmit_EZArchive.py elmsubmit_EZEmail.py elmsubmit_field_validation.py elmsubmit_filename_generator.py elmsubmit_html2txt.py elmsubmit_misc.py elmsubmit_richtext2txt.py elmsubmit_submission_parser.py lex.py lextab.py myhtmlentitydefs.py parsetab.py yacc.py mime.types.edited parser.out -FILESWML = $(wildcard $(srcdir)/*.wml) -EXTRA_DIST = $(FILESWML:$(srcdir)/%=%) mime.types.edited parser.out +pylib_DATA = elmsubmit.py elmsubmit_misc.py config2pyobj.py elmsubmit_doctype_test.py elmsubmit_enriched2txt.py elmsubmit_EZArchive.py elmsubmit_EZEmail.py elmsubmit_field_validation.py elmsubmit_filename_generator.py elmsubmit_html2txt.py elmsubmit_misc.py elmsubmit_richtext2txt.py elmsubmit_submission_parser.py lex.py lextab.py myhtmlentitydefs.py parsetab.py yacc.py mime.types.edited parser.out -CLEANFILES = *.py *~ *.tmp *.pyc +EXTRA_DIST = $(pylib_DATA) mime.types.edited parser.out -%.py: %.py.wml $(top_srcdir)/config/config.wml $(top_builddir)/config/configbis.wml - $(WML) -o $@ $< \ No newline at end of file +CLEANFILES = *~ *.tmp *.pyc \ No newline at end of file diff --git a/modules/elmsubmit/lib/config2pyobj.py b/modules/elmsubmit/lib/config2pyobj.py index c212a2171..41a2ecc84 100644 --- a/modules/elmsubmit/lib/config2pyobj.py +++ b/modules/elmsubmit/lib/config2pyobj.py @@ -1,125 +1,122 @@ -# -*- coding: utf-8 -*- - -## $Id$ - +# -*- coding: utf-8 -*- +## +## $Id$ +## ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002, 2003, 2004, 2005 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. 
## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - import ConfigParser import re # Config file functions: limited_python_identifier = r'^[a-zA-Z][a-zA-Z_0-9]*$' class Config(object): def __init__(self, config_name, config_dict): self._config_name = config_name f = lambda (section_name, section_dict): config_dict.update({section_name : ConfigSection(config_name, section_name, section_dict)}) map(f, config_dict.items()) self._config_dict = config_dict def __getattr__(self, section_name): try: return self._config_dict[section_name] except: raise ConfigGetSectionError("config object '%s' does not contain section '%s'" % (self._config_name, section_name)) def __setattr__(self, attr, value): if attr in ['_config_name', '_config_dict']: self.__dict__[attr] = value else: raise ConfigSetError('this class provides read only config access: setting attributes is not possible') class ConfigSection(object): def __init__(self, config_name, section_name, section_dict): self._config_name = config_name self._section_name = section_name self._section_dict = section_dict def __getattr__(self, key_name): try: return self._section_dict[key_name] except KeyError: raise ConfigGetKeyError("config object '%s', section '%s' does not contain key '%s'" % (self._config_name, self._section_name, key_name)) def __setattr__(self, attr, value): if attr in ['_config_name', '_section_name', '_section_dict']: self.__dict__[attr] = value else: raise ConfigSetError('this class provides read only config access: setting attributes is not possible') def configobj(files, name=None): try: # file[0][0] does not work to see if we have a list of string # like objects, since pythons character are just strings. # hence string[0][0][0][0][0][0][0][0]... is valid. 
files[0][1] except: raise ValueError("function configobj has first argument named files: this must be a list of filenames") if name is None: name = files[0] conf = ConfigParser.ConfigParser() # Read configuration files: map(lambda file: conf.readfp(open(file, 'rb')), files) # Make sure each section and key in the config file is a valid # python identifier: for section in conf.sections(): if not re.search(limited_python_identifier, section): raise ConfigParseError(("config section '%s' is not a valid python identifier string matching regexp " + r"r'^[a-zA-Z][a-zA-Z_0-9]*$'") % (section)) for item, value in conf.items(section): if not re.search(limited_python_identifier, item): raise ConfigParseError(("key '%s' in config section '%s' is not a valid python identifier string matching regexp " + r"r'^[a-zA-Z][a-zA-Z_0-9]*$'") % (item, section)) # Create a dictionary from their contents: config_dict = {} f = lambda section: config_dict.update({ section : dict(conf.items(section)) }) map(f, conf.sections()) return Config(name, config_dict) class ConfigError(Exception): pass class ConfigParseError(ConfigError): pass class ConfigSetError(ConfigError): pass class ConfigGetError(ConfigError): pass class ConfigGetSectionError(ConfigGetError): pass class ConfigGetKeyError(ConfigGetError): pass - + diff --git a/modules/elmsubmit/lib/config2pyobj.py.wml b/modules/elmsubmit/lib/config2pyobj.py.wml deleted file mode 100644 index c212a2171..000000000 --- a/modules/elmsubmit/lib/config2pyobj.py.wml +++ /dev/null @@ -1,125 +0,0 @@ -# -*- coding: utf-8 -*- - -## $Id$ - -## This file is part of the CERN Document Server Software (CDSware). -## Copyright (C) 2002, 2003, 2004, 2005 CERN. -## -## The CDSware is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License as -## published by the Free Software Foundation; either version 2 of the -## License, or (at your option) any later version. -## -## The CDSware is distributed in the hope that it will be useful, but -## WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -## General Public License for more details. -## -## You should have received a copy of the GNU General Public License -## along with CDSware; if not, write to the Free Software Foundation, Inc., -## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. 
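
For readers of this patch, a minimal usage sketch of the regenerated config2pyobj.py above (illustration only, not part of the change; the config path and the 'servers' section / 'smtp' key are assumed example names):

    from cdsware import config2pyobj

    # configobj() takes a list of config file paths and returns a read-only wrapper:
    conf = config2pyobj.configobj(['/opt/cdsware/etc/elmsubmit/elmsubmit.cfg'])
    smtp_host = conf.servers.smtp        # sections and keys become attributes
    try:
        conf.servers.smtp = 'localhost'  # assignment is refused...
    except config2pyobj.ConfigSetError:
        pass                             # ...the wrapper is deliberately read-only
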
- - -import ConfigParser -import re - -# Config file functions: - -limited_python_identifier = r'^[a-zA-Z][a-zA-Z_0-9]*$' - -class Config(object): - - def __init__(self, config_name, config_dict): - self._config_name = config_name - - f = lambda (section_name, section_dict): config_dict.update({section_name : ConfigSection(config_name, section_name, section_dict)}) - map(f, config_dict.items()) - - self._config_dict = config_dict - - def __getattr__(self, section_name): - try: - return self._config_dict[section_name] - except: - raise ConfigGetSectionError("config object '%s' does not contain section '%s'" % (self._config_name, section_name)) - - def __setattr__(self, attr, value): - if attr in ['_config_name', '_config_dict']: - self.__dict__[attr] = value - else: - raise ConfigSetError('this class provides read only config access: setting attributes is not possible') - -class ConfigSection(object): - def __init__(self, config_name, section_name, section_dict): - self._config_name = config_name - self._section_name = section_name - self._section_dict = section_dict - - def __getattr__(self, key_name): - try: - return self._section_dict[key_name] - except KeyError: - raise ConfigGetKeyError("config object '%s', section '%s' does not contain key '%s'" % (self._config_name, self._section_name, key_name)) - - def __setattr__(self, attr, value): - if attr in ['_config_name', '_section_name', '_section_dict']: - self.__dict__[attr] = value - else: - raise ConfigSetError('this class provides read only config access: setting attributes is not possible') - -def configobj(files, name=None): - try: - # file[0][0] does not work to see if we have a list of string - # like objects, since pythons character are just strings. - # hence string[0][0][0][0][0][0][0][0]... is valid. 
- files[0][1] - except: - raise ValueError("function configobj has first argument named files: this must be a list of filenames") - - if name is None: - name = files[0] - - conf = ConfigParser.ConfigParser() - # Read configuration files: - map(lambda file: conf.readfp(open(file, 'rb')), files) - - # Make sure each section and key in the config file is a valid - # python identifier: - for section in conf.sections(): - if not re.search(limited_python_identifier, section): - raise ConfigParseError(("config section '%s' is not a valid python identifier string matching regexp " + - r"r'^[a-zA-Z][a-zA-Z_0-9]*$'") % (section)) - - for item, value in conf.items(section): - if not re.search(limited_python_identifier, item): - raise ConfigParseError(("key '%s' in config section '%s' is not a valid python identifier string matching regexp " + - r"r'^[a-zA-Z][a-zA-Z_0-9]*$'") % (item, section)) - - # Create a dictionary from their contents: - config_dict = {} - f = lambda section: config_dict.update({ section : dict(conf.items(section)) }) - map(f, conf.sections()) - - return Config(name, config_dict) - -class ConfigError(Exception): - pass - -class ConfigParseError(ConfigError): - pass - -class ConfigSetError(ConfigError): - pass - -class ConfigGetError(ConfigError): - pass - -class ConfigGetSectionError(ConfigGetError): - pass - -class ConfigGetKeyError(ConfigGetError): - pass - - diff --git a/modules/elmsubmit/lib/elmsubmit.py b/modules/elmsubmit/lib/elmsubmit.py index 079b3ddf6..8b219e971 100644 --- a/modules/elmsubmit/lib/elmsubmit.py +++ b/modules/elmsubmit/lib/elmsubmit.py @@ -1,248 +1,245 @@ -# -*- coding: utf-8 -*- - -## $Id$ - +# -*- coding: utf-8 -*- +## +## $Id$ +## ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002, 2003, 2004, 2005 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. 
- - # import sys import os import os.path import cdsware.config2pyobj as config2pyobj import re import smtplib import cdsware.elmsubmit_EZEmail as elmsubmit_EZEmail import cdsware.elmsubmit_submission_parser as elmsubmit_submission_parser import cdsware.elmsubmit_field_validation as elmsubmit_field_validation from cdsware.elmsubmit_misc import random_alphanum_string as _random_alphanum_string from cdsware.elmsubmit_misc import import_dots as _import_dots # Import the config file: from cdsware.config import etcdir #_this_module = sys.modules[__name__] #_this_module_dir = os.path.abspath(os.path.dirname(_this_module.__file__)) elmconf = config2pyobj.configobj([os.path.join(etcdir, 'elmsubmit', 'elmsubmit.cfg')]) def process_email(email_string): # See if we can parse the email: try: e = elmsubmit_EZEmail.ParseMessage(email_string) except elmsubmit_EZEmail.EZEmailParseError, err: try: if err.basic_email_info['from'] is None: raise ValueError response = elmsubmit_EZEmail.CreateMessage(to=err.basic_email_info['from'], _from=elmconf.people.admin, message=elmconf.nolangmsgs.bad_email, subject="Re: " + (err.basic_email_info.get('Subject', '') or ''), references=[err.basic_email_info.get('message-id', '') or ''], wrap_message=False) _send_smtp(_from=elmconf.people.admin, to=err.basic_email_info['from'], msg=response) raise elmsubmitError("Email could not be parsed. Reported to sender.") except ValueError: raise elmsubmitError("From: field of submission email could not be parsed. Could not report to sender.") # See if we can parse the submission fields in the email: try: # Note that this returns a dictionary loaded with utf8 byte strings: (submission_dict, dummy_var) = elmsubmit_submission_parser.parse_submission(e.primary_message.encode('utf8')) # Add the submitter's email: submission_dict['SuE'] = e.from_email.encode('utf8') except elmsubmit_submission_parser.SubmissionParserError: _notify(msg=e, response=elmconf.nolangmsgs.bad_submission) raise elmsubmitSubmissionError("Could not parse submission.") # See if we can find a recognized document type specified by the TYPE field: try: doctype = submission_dict['type'] handler_module_name = elmconf.sub_handlers.__getattr__(doctype) except KeyError: _notify(msg=e, response=elmconf.nolangmsgs.missing_type) raise elmsubmitSubmissionError("Submission does not specify document type.") except config2pyobj.ConfigGetKeyError: _notify(msg=e, response=elmconf.nolangmsgs.unsupported_type) raise elmsubmitSubmissionError("Submission specifies unrecognized document type.") # See if we can import the python module containing a handler for # the document type: handler_module = _import_dots('cdsware.' + handler_module_name) handler_function = getattr(handler_module, 'handler') required_fields = getattr(handler_module, 'required_fields') # Check we have been given the required fields: available_fields = submission_dict.keys() if not len(filter(lambda x: x in available_fields, required_fields)) == len(required_fields): response = elmconf.nolangmsgs.missing_fields_1 + (' %s ' % (doctype)) + elmconf.nolangmsgs.missing_fields_2 + "\n\n" + repr(required_fields) _notify(msg=e, response=response) raise elmsubmitSubmissionError("Submission does not contain the required fields for document type %s. 
Required fields: %s" % (doctype, required_fields)) # Check that the fields we have been given validate OK: map(lambda field: validate_submission_field(e, submission_dict, field, submission_dict[field]), required_fields) # Map the fields to their proper storage names: def f((field, value)): try: field = elmconf.field_mappings.__getattr__(field) except config2pyobj.ConfigGetKeyError: # No mapping defined for field: pass return(field, value) submission_dict = dict(map(f, submission_dict.items())) # Let the handler function process the email: (response_email, admin_response_email, error) = handler_function(msg=e, submission_dict=submission_dict, elmconf=elmconf) # Reply to the sender if there was a problem: if response_email is not None: _notify(msg=e, response=response_email) # Reply to the admin if there was a failure: if admin_response_email is not None: _notify_admin(response=admin_response_email) if error is not None: raise error def validate_submission_field(msg, submission_dict, field, value): try: (field_documentation, fixed_value, ok) = getattr(elmsubmit_field_validation, field)(value.decode('utf8')) submission_dict[field] = fixed_value.encode('utf8') if not ok: _notify(msg=msg, response=elmconf.nolangmsgs.bad_field + ' ' + field.upper() + '\n\n' + elmconf.nolangmsgs.correct_format + '\n\n' + field_documentation) raise elmsubmitSubmissionError("Submission contains field %s which does not validate." % (field)) except AttributeError: # No validation defined for this field: pass def get_storage_dir(msg, doctype): path = os.path.join(elmconf.files.maildir, doctype, _random_alphanum_string(15)) while os.path.exists(path): path = os.path.join(elmconf.files.maildir, doctype, _random_alphanum_string(15)) try: os.mkdir(path) except EnvironmentError: _notify(msg=msg, response=elmconf.nolangmsgs.temp_problem) _notify_admin(response="Could not create directory: %s" % (path)) raise elmsubmitError("Could not create directory: %s" % (path)) return path def process_files(msg, submission_dict): files = map(lambda filename: filename.decode('utf8'), submission_dict['files'].splitlines()) # Check for the special filename 'all': if we find it, add all of # the files attached to the email to the list of files to submit: if 'all' in files: f = lambda attachment: attachment['filename'] is not None g = lambda attachment: attachment['filename'].lower() attached_filenames = map(g, filter(f, msg.attachments)) files.extend(attached_filenames) files = filter(lambda name: name != 'all', files) # Filter out duplicate filenames: _temp = {} map(lambda filename: _temp.update({ filename : 1}), files) files = _temp.keys() # Get the files out of the mail message: file_list = {} for filename in files: # See if we have special keyword self (which uses the mail message itself as the file): if filename == 'self': file = msg.original_message filename = _random_alphanum_string(8) + '_' + msg.date_sent_utc.replace(' ', '_').replace(':', '-') + '.msg' else: nominal_attachments = filter(lambda attachment: attachment['filename'].lower() == filename, msg.attachments) try: file = nominal_attachments[0]['file'] except IndexError: _notify(msg=msg, response=elmconf.nolangmsgs.missing_attachment + ' ' + filename) raise elmsubmitSubmissionError("Submission is missing attached file: %s" % (filename)) file_list[filename.encode('utf8')] = file submission_dict['files'] = file_list def _send_smtp(_from, to, msg): s = smtplib.SMTP() s.connect(host=elmconf.servers.smtp) s.sendmail(_from, to, msg) s.close() def _notify(msg, response): response = 
elmsubmit_EZEmail.CreateMessage(to=[(msg.from_name, msg.from_email)], _from=elmconf.people.admin, message=response, subject="Re: " + msg.subject, references=[msg.message_id], wrap_message=False) _send_smtp(_from=elmconf.people.admin, to=msg.from_email, msg=response) def _notify_admin(response): response = elmsubmit_EZEmail.CreateMessage(to=elmconf.people.admin, _from=elmconf.people.admin, message=response, subject="CDSWare / elmsubmit problem.", wrap_message=False) _send_smtp(_from=elmconf.people.admin, to=elmconf.people.admin, msg=response) class elmsubmitError(Exception): pass class elmsubmitSubmissionError(elmsubmitError): pass class _elmsubmitPrivateError(Exception): """ An emtpy parent class for all the private errors in this module. """ pass - + diff --git a/modules/elmsubmit/lib/elmsubmit.py.wml b/modules/elmsubmit/lib/elmsubmit.py.wml deleted file mode 100644 index 079b3ddf6..000000000 --- a/modules/elmsubmit/lib/elmsubmit.py.wml +++ /dev/null @@ -1,248 +0,0 @@ -# -*- coding: utf-8 -*- - -## $Id$ - -## This file is part of the CERN Document Server Software (CDSware). -## Copyright (C) 2002, 2003, 2004, 2005 CERN. -## -## The CDSware is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License as -## published by the Free Software Foundation; either version 2 of the -## License, or (at your option) any later version. -## -## The CDSware is distributed in the hope that it will be useful, but -## WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -## General Public License for more details. -## -## You should have received a copy of the GNU General Public License -## along with CDSware; if not, write to the Free Software Foundation, Inc., -## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - -# import sys -import os -import os.path -import cdsware.config2pyobj as config2pyobj -import re -import smtplib - -import cdsware.elmsubmit_EZEmail as elmsubmit_EZEmail -import cdsware.elmsubmit_submission_parser as elmsubmit_submission_parser -import cdsware.elmsubmit_field_validation as elmsubmit_field_validation -from cdsware.elmsubmit_misc import random_alphanum_string as _random_alphanum_string -from cdsware.elmsubmit_misc import import_dots as _import_dots - -# Import the config file: -from cdsware.config import etcdir - -#_this_module = sys.modules[__name__] -#_this_module_dir = os.path.abspath(os.path.dirname(_this_module.__file__)) -elmconf = config2pyobj.configobj([os.path.join(etcdir, 'elmsubmit', 'elmsubmit.cfg')]) - -def process_email(email_string): - - # See if we can parse the email: - - try: - e = elmsubmit_EZEmail.ParseMessage(email_string) - except elmsubmit_EZEmail.EZEmailParseError, err: - try: - if err.basic_email_info['from'] is None: - raise ValueError - response = elmsubmit_EZEmail.CreateMessage(to=err.basic_email_info['from'], - _from=elmconf.people.admin, - message=elmconf.nolangmsgs.bad_email, - subject="Re: " + (err.basic_email_info.get('Subject', '') or ''), - references=[err.basic_email_info.get('message-id', '') or ''], - wrap_message=False) - _send_smtp(_from=elmconf.people.admin, to=err.basic_email_info['from'], msg=response) - raise elmsubmitError("Email could not be parsed. Reported to sender.") - except ValueError: - raise elmsubmitError("From: field of submission email could not be parsed. 
Could not report to sender.") - - # See if we can parse the submission fields in the email: - - try: - # Note that this returns a dictionary loaded with utf8 byte strings: - (submission_dict, dummy_var) = elmsubmit_submission_parser.parse_submission(e.primary_message.encode('utf8')) - # Add the submitter's email: - submission_dict['SuE'] = e.from_email.encode('utf8') - - except elmsubmit_submission_parser.SubmissionParserError: - _notify(msg=e, response=elmconf.nolangmsgs.bad_submission) - raise elmsubmitSubmissionError("Could not parse submission.") - - # See if we can find a recognized document type specified by the TYPE field: - - try: - doctype = submission_dict['type'] - handler_module_name = elmconf.sub_handlers.__getattr__(doctype) - except KeyError: - _notify(msg=e, response=elmconf.nolangmsgs.missing_type) - raise elmsubmitSubmissionError("Submission does not specify document type.") - except config2pyobj.ConfigGetKeyError: - _notify(msg=e, response=elmconf.nolangmsgs.unsupported_type) - raise elmsubmitSubmissionError("Submission specifies unrecognized document type.") - - # See if we can import the python module containing a handler for - # the document type: - handler_module = _import_dots('cdsware.' + handler_module_name) - handler_function = getattr(handler_module, 'handler') - required_fields = getattr(handler_module, 'required_fields') - - # Check we have been given the required fields: - available_fields = submission_dict.keys() - - if not len(filter(lambda x: x in available_fields, required_fields)) == len(required_fields): - response = elmconf.nolangmsgs.missing_fields_1 + (' %s ' % (doctype)) + elmconf.nolangmsgs.missing_fields_2 + "\n\n" + repr(required_fields) - _notify(msg=e, response=response) - raise elmsubmitSubmissionError("Submission does not contain the required fields for document type %s. Required fields: %s" % (doctype, required_fields)) - - # Check that the fields we have been given validate OK: - - map(lambda field: validate_submission_field(e, submission_dict, field, submission_dict[field]), required_fields) - - # Map the fields to their proper storage names: - - def f((field, value)): - try: - field = elmconf.field_mappings.__getattr__(field) - except config2pyobj.ConfigGetKeyError: - # No mapping defined for field: - pass - - return(field, value) - - submission_dict = dict(map(f, submission_dict.items())) - - # Let the handler function process the email: - - (response_email, admin_response_email, error) = handler_function(msg=e, submission_dict=submission_dict, elmconf=elmconf) - - # Reply to the sender if there was a problem: - - if response_email is not None: - _notify(msg=e, response=response_email) - - # Reply to the admin if there was a failure: - - if admin_response_email is not None: - _notify_admin(response=admin_response_email) - - if error is not None: - raise error - -def validate_submission_field(msg, submission_dict, field, value): - - try: - (field_documentation, fixed_value, ok) = getattr(elmsubmit_field_validation, field)(value.decode('utf8')) - submission_dict[field] = fixed_value.encode('utf8') - - if not ok: - _notify(msg=msg, response=elmconf.nolangmsgs.bad_field + ' ' + field.upper() + '\n\n' - + elmconf.nolangmsgs.correct_format + '\n\n' + field_documentation) - raise elmsubmitSubmissionError("Submission contains field %s which does not validate." 
% (field)) - except AttributeError: - # No validation defined for this field: - pass - -def get_storage_dir(msg, doctype): - - path = os.path.join(elmconf.files.maildir, doctype, _random_alphanum_string(15)) - while os.path.exists(path): - path = os.path.join(elmconf.files.maildir, doctype, _random_alphanum_string(15)) - - try: - os.mkdir(path) - except EnvironmentError: - _notify(msg=msg, response=elmconf.nolangmsgs.temp_problem) - _notify_admin(response="Could not create directory: %s" % (path)) - raise elmsubmitError("Could not create directory: %s" % (path)) - return path - -def process_files(msg, submission_dict): - - files = map(lambda filename: filename.decode('utf8'), submission_dict['files'].splitlines()) - - # Check for the special filename 'all': if we find it, add all of - # the files attached to the email to the list of files to submit: - - if 'all' in files: - - f = lambda attachment: attachment['filename'] is not None - g = lambda attachment: attachment['filename'].lower() - attached_filenames = map(g, filter(f, msg.attachments)) - - files.extend(attached_filenames) - files = filter(lambda name: name != 'all', files) - - # Filter out duplicate filenames: - _temp = {} - map(lambda filename: _temp.update({ filename : 1}), files) - files = _temp.keys() - - # Get the files out of the mail message: - - file_list = {} - - for filename in files: - - # See if we have special keyword self (which uses the mail message itself as the file): - if filename == 'self': - file = msg.original_message - filename = _random_alphanum_string(8) + '_' + msg.date_sent_utc.replace(' ', '_').replace(':', '-') + '.msg' - else: - nominal_attachments = filter(lambda attachment: attachment['filename'].lower() == filename, msg.attachments) - - try: - file = nominal_attachments[0]['file'] - except IndexError: - _notify(msg=msg, response=elmconf.nolangmsgs.missing_attachment + ' ' + filename) - raise elmsubmitSubmissionError("Submission is missing attached file: %s" % (filename)) - - file_list[filename.encode('utf8')] = file - - submission_dict['files'] = file_list - - -def _send_smtp(_from, to, msg): - - s = smtplib.SMTP() - s.connect(host=elmconf.servers.smtp) - s.sendmail(_from, to, msg) - s.close() - -def _notify(msg, response): - response = elmsubmit_EZEmail.CreateMessage(to=[(msg.from_name, msg.from_email)], - _from=elmconf.people.admin, - message=response, - subject="Re: " + msg.subject, - references=[msg.message_id], - wrap_message=False) - - _send_smtp(_from=elmconf.people.admin, to=msg.from_email, msg=response) - -def _notify_admin(response): - response = elmsubmit_EZEmail.CreateMessage(to=elmconf.people.admin, - _from=elmconf.people.admin, - message=response, - subject="CDSWare / elmsubmit problem.", - wrap_message=False) - _send_smtp(_from=elmconf.people.admin, to=elmconf.people.admin, msg=response) - -class elmsubmitError(Exception): - pass - -class elmsubmitSubmissionError(elmsubmitError): - pass - -class _elmsubmitPrivateError(Exception): - """ - An emtpy parent class for all the private errors in this module. - """ - pass - - diff --git a/modules/elmsubmit/lib/elmsubmit_EZArchive.py b/modules/elmsubmit/lib/elmsubmit_EZArchive.py index 88e9decd2..5789c8b29 100644 --- a/modules/elmsubmit/lib/elmsubmit_EZArchive.py +++ b/modules/elmsubmit/lib/elmsubmit_EZArchive.py @@ -1,1040 +1,1034 @@ -# -*- coding: utf-8 -*- - -## $Id$ - +# -*- coding: utf-8 -*- +## +## $Id$ +## ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002, 2003, 2004, 2005 CERN. 
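
A sketch of how a delivery agent might hand a raw RFC-2822 message to the elmsubmit.py module shown above (illustration only; reading the message from stdin is an assumption, not something this patch prescribes):

    import sys
    from cdsware import elmsubmit

    raw_message = sys.stdin.read()
    try:
        elmsubmit.process_email(raw_message)
    except elmsubmit.elmsubmitSubmissionError, err:
        # the sender has normally been notified already via _notify(); log locally
        sys.stderr.write('submission rejected: %s\n' % (err,))
    except elmsubmit.elmsubmitError, err:
        sys.stderr.write('elmsubmit failure: %s\n' % (err,))
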
## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - - import gzip import bz2 import zipfile import tarfile import shutil import os import copy import re import tempfile import glob import sys WARN_SKIP = True from cdsware.elmsubmit_filename_generator import calculate_filename_extension as _calculate_filename_extension # from cdsware.elmsubmit_filename_generator import generate_filename as _generate_filename from cdsware.elmsubmit_misc import write_to_and_return_tempfile_name as _write_to_and_return_tempfile_name from cdsware.elmsubmit_misc import provide_dir_with_perms_then_exec as _provide_dir_with_perms_then_exec from cdsware.elmsubmit_misc import dirtree as _dirtree from cdsware.elmsubmit_misc import count_dotdot as _count_dotdot from cdsware.elmsubmit_misc import get_perms as _get_perms from cdsware.elmsubmit_misc import random_alphanum_string as _random_alphanum_string from cdsware.elmsubmit_misc import backup_directory as _backup_directory from cdsware.elmsubmit_misc import open_tempfile as _open_tempfile from cdsware.elmsubmit_misc import split_common_path as _split_common_path from cdsware.elmsubmit_misc import recursive_dir_contents as _recursive_dir_contents from cdsware.elmsubmit_misc import concat as _concat from cdsware.elmsubmit_misc import mkdir_parents as _mkdir_parents # Store all files written out in two lists: # 1. remove_always is for temporary files, which we try to remove regardless. # 2. remove_on_error is for files the user wants, but need removing if we # encounter an error. def _validate_args(arg, received, allowed): if received not in allowed: raise ValueError('argument %s must be a value from set %s: got %s' % (arg, allowed, received)) _remove_on_error = [] _remove_always = [] def _remember_write(file_loc, error_only=False): if error_only: _remove_on_error.append(file_loc) else: _remove_always.append(file_loc) def _delete_files(list): for item in list: if os.path.isdir(item): shutil.rmtree(item) else: os.unlink(item) def _calc_perms(permissions, umask): return permissions & (~umask) ## os.chmod('/tmp/thisthis', stat.S_IMODE(os.stat('/tmp')[stat.ST_MODE])) def _check_mode(current_mode, allowed_mode): if current_mode != allowed_mode: raise _ModeError _valid_file_types = ['regular', 'dir', 'symlink', 'hardlink', 'char_dev', 'block_dev', 'fifo'] def _file_type(tarinfo_obj): if tarinfo_obj.isfile(): return 'regular' elif tarinfo_obj.isdir(): return 'dir' elif tarinfo_obj.issym(): return 'symlink' elif tarinfo_obj.islnk(): return 'hardlink' elif tarinfo_obj.ischr(): return 'char_dev' elif tarinfo_obj.isblk(): return 'block_dev' elif tarinfo_obj.isfifo(): return 'fifo' def _pick_compression_type(ext): # Fix the extension; for example if its a gzipped pdf, # calculate_filname_extension will return pdf.gz. 
To combat # this, we find the longest extension from: tar.gz, tar.bz2, # tar, gz, bz2. return re.sub(r'^.*?(tar\.gz|tar\.bz2|tar|gz|bz2)$', r'\1', string=ext, count=1) def _verify_filename(name, seen_filenames, filename_collision, num_random_bits, rename_from_set): # name could be a filename or directory. if seen_filenames.has_key(name): seen_filenames[name] += 1 times = seen_filenames[name] (dirname, basename) = os.path.split(name) if filename_collision == 'throw_error': raise EZArchiveError('filename collision: %s' % (name)) elif filename_collision == 'rename_similar': # Just in case archive contains a list of # filenames that follow the same pattern as this # incrementing, we need to check the increment # doesn't collide as well: incremented_basename = str(times) + '.' + basename while seen_filenames.has_key(os.path.join(dirname, incremented_basename)): times += 1 incremented_basename = str(times) + '.' + basename # Make a note of how many increments we've had to # do: seen_filenames[name] = times name = os.path.join(dirname, incremented_basename) elif filename_collision == 'rename_random': # Just in case of random collision, we introduce the while loop. randbasename = _random_alphanum_string(num_random_bits, chars=rename_from_set) tries = 1 while seen_filenames.has_key(os.path.join(dirname, randbasename)): randbasename = _random_alphanum_string(num_random_bits, chars=rename_from_set) # If user gives small set rename_from_set and low number of bits, # then it is possible we will exhaust all posibile combinations: tries += 1 if tries > 20: raise EZArchiveError('20 random filename selections collided: perhaps you need to increase num_rand_bits?') seen_filenames[os.path.join(dirname, randbasename)] = 0 name = os.path.join(dirname, randbasename) elif filename_collision == 'overwrite': pass elif filename_collision == 'skip': return ['skip'] else: seen_filenames[name] = 0 return name def extract(input, # byte string of file location input_disposition='byte_string', # ['byte_string', 'file_location'] compression_hint=None, # [None, 'gz', 'bz2', 'tar', 'tar.gz', 'tar.bz2', 'zip'] extract_to='byte_strings', # ['byte_strings', 'my_directory', 'temp_directory'] my_directory=None, # directory path backup_extension=None, # extension including dot, for backup of my_directory directory_structure='retain', # ['retain', 'flatten'] file_handle = None, # [None, 'py', 'os'] file_handle_mode = 'rb', force_file_permissions=None, # file permission bits. eg 0777. force_dir_permissions=None, # file permission bits. eg 0777. umask=None, # file permission bits. eg. 0777 (assuming standard umask interpretation). allow_file_types=_valid_file_types, # list containing any of ['regular, dir, symlink, hardlink, char_dev, block_dev, fifo'] on_find_invalid_file_type='throw_error', # ['throw_error', 'skip'] filename_collision='rename_similar', # ['throw_error', 'rename_similar', 'rename_random', 'overwrite', 'skip'] rename_from_set='abcdefghijklmnopqrstuvwxyz', # characters to use if filename_collision='rename_random' num_random_bits=8, # number of random bits to use in the random filename. allow_clobber=False, # [True, False] on_find_dotdot_path='throw_error', # ['throw_error', 'skip', 'allow'] on_find_absolute_path='throw_error' # ['throw_error', 'skip', 'allow'] # Shelved options: # file_name_regexp, non_matches='rename_safely', etc. # Hopefully to be implemented in the future. 
): # Clean out the written files list: global _remove_on_error global _remove_always _remove_on_error = [] _remove_always = [] # Validate arguments. _validate_args('input_disposition', input_disposition, ['byte_string', 'file_location']) _validate_args('compression_hint', compression_hint, [None] + available_tools.keys()) _validate_args('extract_to', extract_to, ['byte_strings', 'my_directory', 'temp_directory']) # _validate_args('extract_to', return_objects, [None, 'file_location', 'open_c_filehandles', 'open_py_file_handles']) f = lambda type: _validate_args('allow_file_types', type, _valid_file_types) map(f, allow_file_types) if not input: raise ValueError('argument input must specify a filename or a byte string') # From here on, we start writing things out to disk, so we wrap it # in a try loop and catch all exceptions. This allows us to clean # up the disk if we didn't succeed with the whole of the # extraction. try: # try/except/finally cannot be combined, so we have to nest: try: # Write input to a temp file if we are given a byte string. if input_disposition == 'byte_string': input_file_loc = _write_to_and_return_tempfile_name(input) _remember_write(input_file_loc) else: # input_disposition == 'file_location' # Check that the input file location we've been given exists; # stat will throw the right error for us: os.stat(input) # Check it is a file: if not os.path.isfile(input): raise ValueError("argument input must be a path to an archive file if input_disposition='file_location': %s" % (input)) input_file_loc = input # Make sure we know what type of file we're dealing with: if compression_hint is None: compression_ext = _calculate_filename_extension(filename=input_file_loc) compression_ext = _pick_compression_type(compression_ext) else: compression_ext = compression_hint # Select approriate archive/compression tool: try: tool_class = available_tools[compression_ext] except KeyError: raise EZArchiveError('Unrecognized archive type: %s' % (compression_ext)) # Instantiate the tool: archive = tool_class(input_file_loc, mode='r', allow_clobber=allow_clobber) if extract_to == 'byte_strings': # If extract_to == byte_strings, permissions mean nothing. # However, because we use a temp directory to load the files # into byte strings, we force the permissions to be nice and # liberal inside the temp dir: force_file_permissions = 0700 force_dir_permissions = 0700 # Get extraction_root: if extract_to == 'byte_strings' or extract_to == 'temp_directory': # Need a temp directory to work in. extraction_root = tempfile.mkdtemp() if extract_to == 'byte_strings': _remember_write(extraction_root, error_only=False) else: # extract_to == 'temp_directory': _remember_write(extraction_root, error_only=True) else: # extract_to == 'my_directory': if my_directory is None: raise ValueError("my_directory must be specified if extract_to='my_directory'") # Make given directory into a nice sane one. my_directory = os.path.abspath(os.path.expanduser(os.path.normpath(my_directory))) # Check it exists, and we can stat it: # stat will throw the right error for us: os.stat(my_directory) # Check it is a dir. 
if not os.path.isdir(my_directory): raise ValueError("argument my_directory must be a directory: %s" % (my_directory)) # If we've been asked to back it up, do so: if backup_extension is not None: backup_dir = my_directory + backup_extension if _backup_directory(my_directory, backup_dir) is not None: raise EZArchiveError('creation of backup directory using GNU mirrordir failed: %s' % (backup_dir)) # Finally set the extraction root: extraction_root = my_directory # Logically we would also check we have write permissions # here. But this is acutally better served by letting # builtin/other functions raise EnvironmentErrors when we fail # to write: Checking for write permissions is actually quite # complex: e.g. you'd have to check group membership to see if # the group bits allow write. # If we haven't been given a umask, use take the system umask as a # default. If we have been given a umask, set the system umask to # it, so all calls to builtin open/file apply the given umask: if umask is None: # It doesn't seem possible to read the umask without also # setting it. Hence this fudge: umask = os.umask(0777) os.umask(umask) # Used in the extraction for loop to check for filename collisions # when flattening directory structure: seen_filenames = {} # Collect the returned file information here: return_data = [] for mem in archive.list_all_members(): name = mem['name'] dir = mem['dir'] file_type = mem['file_type'] identity_object = mem['identity_object'] # Check it is an allowed file type: if file_type not in allow_file_types: if on_find_invalid_file_type=='skip': continue else: # on_find_invalid_file_type='throw_error': raise EZArchiveError("found disallowed file type '%s': %s" % (file_type, os.path.join(dir, name))) # Deal with dotdot paths: if on_find_dotdot_path == 'allow': pass else: # check if path contains '..' dir_parts = dir.split(os.sep) if '..' in dir_parts or name == '..': if on_find_dotdot_path == 'throw_error': raise EZArchiveError("tar entry's path contains '..' (*cautiously* consider on_find_dotdot_path='allow'): " + os.path.join(dir, name)) else: # on_find_dotdot_path == 'skip' # next file please: continue # Deal with absolute paths in a similar way: if on_find_absolute_path == 'allow': pass else: # check if path begins with '/' if dir != '' and dir[0] == '/': if on_find_absolute_path == 'throw_error': raise EZArchiveError("tar entry's path is absolute (*cautiously* consider on_find_absolute_path='allow'): " + os.path.join(dir, name)) else: # on_find_absolute_path == 'skip' # next file please: continue # Deal with flattening of directories: if directory_structure == 'flatten': dir = '' if file_type == 'dir': continue # tars allow multiple entries for same path/file: # extracting such tarballs with GNU/tar will just # cause the second entry to overwrite the first. 
We # try to be more graceful: verified_fullname = _verify_filename(name=os.path.join(dir, name), seen_filenames=seen_filenames, filename_collision=filename_collision, num_random_bits=num_random_bits, rename_from_set=rename_from_set) if verified_fullname == ['skip']: continue name = os.path.basename(verified_fullname) archive.extract_member(identity_object=identity_object, root_dir=extraction_root, dir=dir, new_filename=name, umask=umask, force_file_permissions=force_file_permissions, force_dir_permissions=force_dir_permissions, allow_clobber=allow_clobber) fullname = os.path.join(extraction_root, dir, name) file_info = {} file_info['basename'] = name file_info['tar_dir'] = dir file_info['file_type'] = file_type if extract_to == 'byte_strings': if file_type == 'regular': file_info['file'] = open(fullname, 'rb').read() else: # extract_to in ['my_directory', 'temp_directory'] file_info['fullname'] = fullname file_info['dirname'] = os.path.join(extraction_root, dir) if file_type == 'regular': if file_handle == 'py': file_info['fh'] = open(fullname, file_handle_mode) elif file_handle == 'os': file_info['fh'] = os.open(fullname, file_handle_mode) return_data.append(file_info) if extract_to == 'temp_directory': return (extraction_root, return_data) else: return return_data except: # Clean up non-temporary file if we get an error: _delete_files(_remove_on_error) raise finally: # Always clean up temporary files, error or not: _delete_files(_remove_always) def create(input, # list of files or named ([['name', 'data...'], ...]) or anonymous ([[data...], ...]) byte strings. input_disposition='named_byte_strings', # ['file_locations', 'anonymous_byte_strings', 'named_byte_strings'] compression='tar.gz', # ['gz', 'bz2', 'tar', 'tar.gz', 'tar.bz2', 'zip'] compress_to = 'byte_string', # ['byte_string', 'my_file', 'temp_file'] my_file=None, # name of output archive, if compress_to='my_file' recurse_dirs=True, # [True, False] directory_structure='retain', # ['retain', 'flatten'] use_compression_root='calculate_minimum', # ['calculate_minimum', 'this_root'] this_root=None, # root path for compression of files. filename_collision='rename_similar', # ['throw_error', 'rename_similar', 'rename_random', 'overwrite', 'skip'] rename_from_set='abcdefghijklmnopqrstuvwxyz', # characters to use if filename_collision='rename_random' num_random_bits=8, # number of random bits to use in the random filename. force_file_permissions=None, # file permission bits. eg 0777. force_dir_permissions=None, # file permission bits. eg 0777. file_handle = None, # [None, 'py', 'os'] file_handle_mode = 'rb', allow_clobber=False, # [True, False] ): # Basic idea: If we are told to output an archive (tar or zip) # then all files given in input are put into a single archive. If # we are told to output compressed files (gz, bz2) then we must be # given a maximum of one archive file. # If we are given anonymous byte strings with no filename, we use # filename_generator.generate_filename() to provide a random # filename with hopefully a correct file extension. # Clean out written files list: global _remove_on_error global _remove_always _remove_on_error = [] _remove_always = [] # Validate arguments. # ?????????????????? # From here on, we start writing things out to disk, so we wrap it # in a try loop and catch all exceptions. This allows us to clean # up the disk if we didn't succeed with the whole of the # extraction. 
try: # try/except/finally cannot be combined, so we have to nest: try: # Write input to a temp file if we are given a byte string. # Work out where the output archive file is going to go: if compress_to == 'my_file': if my_file is None: raise ValueError("if compress_to == 'my_file' then argument my_file must be specified. got None.") # Make given file into a nice sane one: archive_fullname = os.path.abspath(os.path.expanduser(os.path.normpath(my_file))) # Should we remember this file or not? If we get an error in # the middle of processing, should we delete a user specified # archive file? The decision is not so clear cut as with # temporary files (see next). My choice is not to remember # (and so not to delete on error) else: # compress_to in ['temp_file', 'byte_string'] (tf, tf_name) = _open_tempfile(mode='wb') # close filehandle because we don't need it: tf.close() # delete the empty tempfile that open_tempfile # created, so we don't get ClobberError os.unlink(tf_name) del tf if compress_to == 'temp_file': _remember_write(tf_name, error_only=True) else: # compress_to == 'byte_string' _remember_write(tf_name, error_only=False) archive_fullname = tf_name # Get an archive/compress tool: tool_class = available_tools[compression] archive = tool_class(file_loc=archive_fullname, mode='w', allow_clobber=allow_clobber) # Deal with the input: # We do this as follows: # 1. Take anonymous byte strings and turn them into byte strings # by generating a filename for each string, then set # input=[new list of named byte strings] # input_disposition='named_byte_strings' # 2. Take named byte strings and write them to a temporary # directory, chdir to this directory and set: # input = [glob of temp dir] # input_diposition = 'file_locations' if input_disposition == 'anonymous_byte_strings': # If input is anonymous byte strings, we need generate a filename # for each of the strings: seen_rand_names = [] def f(bytstr): rand_name = _random_alphanum_string(num_random_bits, chars=rename_from_set) tries = 1 while rand_name in seen_rand_names: rand_name = _random_alphanum_string(num_random_bits, chars=rename_from_set) tries += 1 if tries > 20: raise EZArchiveError('20 random filename selections collided: perhaps you need to increase num_rand_bits?') seen_rand_names.append(rand_name) return [rand_name, bytstr] input = map(f, input) input_disposition = 'named_byte_strings' if input_disposition == 'named_byte_strings': # Write the byte strings out to the temporary directory. temp_dir = tempfile.mkdtemp() _remember_write(temp_dir, error_only=False) if this_root is not None: # santize: this_root = os.path.abspath(os.path.expanduser(os.path.normpath(this_root))) # chop off the root slashes: this_root = re.sub(r'^/+', '', string=this_root, count=1) # rejig the root dir to reflect the fact we've shoved # everything under a psuedo-root temp directory: this_root = os.path.join(temp_dir, this_root) new_input = [] seen_filenames = {} for filename, bytestr in input: # Sanitize the filename we've been given: filename = os.path.abspath(os.path.expanduser(os.path.normpath(filename))) # chop off the root slashes: filename = re.sub(r'^/+', '', string=filename, count=1) dirname = os.path.dirname(filename) # Use temp_dir as a 'fake_root': (There is some possible # dodginess here if the user names one of the files as if # it were inside the not yet existant temp directory: # unlikely scenario; should we work around it? I haven't. 
_mkdir_parents(os.path.join(temp_dir, dirname)) filename = _verify_filename(name=filename, seen_filenames=seen_filenames, filename_collision=filename_collision, num_random_bits=num_random_bits, rename_from_set=rename_from_set) if filename == ['skip']: continue tempfile_fullname = os.path.join(temp_dir, filename) open(tempfile_fullname, 'wb').write(bytestr) new_input.append(tempfile_fullname) input = new_input input_disposition='file_locations' # At this point, input_disposition='file_locations' and input contains a list of filenames. # sanitize the list of filenames f = lambda x: os.path.abspath(os.path.expanduser(os.path.normpath(x))) input = map(f, input) # Expand any directories into filenames (excluding symlinks): new_input = [] for item in input: if os.path.isdir(item): new_input.append(item) if recurse_dirs: new_input.extend(_recursive_dir_contents(item)) else: new_input.append(item) input = new_input # calculate the compression root: if use_compression_root == 'calculate_minimum': first_input = input[0] if input == filter(lambda x: x == first_input, input): # all of the filenames we've been given are the same: compression_root = os.path.dirname(first_input) files_to_compress = [os.path.basename(first_input)] * len(input) else: # find out the common root of the filenames: (compression_root, files_to_compress) = _split_common_path(input) # if compression_root was also specified in input, it will # have become a blank entry '' in files_to_compress: files_to_compress = filter(lambda x: (x != '' and True) or False, files_to_compress) else: # use_compression_root == 'this_root': if this_root is None: raise EZArchiveError("if compression_root=='this_root' then argument this_root must be specified") this_root = os.path.abspath(os.path.expanduser(os.path.normpath(this_root))) # check that this_root is indeed a prefix of all of the input # files we've been given: if input != filter(lambda file: this_root in _dirtree(file), input): raise EZArchiveError('not all files specified in argument input are children of argument this_root') # get rid of the entries that are exactly this_root: input = filter(lambda file: file != this_root, input) compression_root = this_root # Chop off this_root from input: if this_root == '/' or this_root == '//': this_root_len = len(this_root) else: this_root_len = len(this_root + '/') files_to_compress = map(lambda file: file[this_root_len:], input) old_cwd = os.getcwd() os.chdir(compression_root) seen_filenames = {} for file_to_compress in files_to_compress: if directory_structure == 'flatten': if os.path.isdir(file_to_compress): continue archive_name = os.path.basename(file_to_compress) archive_name = _verify_filename(name=archive_name, seen_filenames=seen_filenames, filename_collision=filename_collision, num_random_bits=num_random_bits, rename_from_set=rename_from_set) if archive_name == ['skip']: continue archive.add_member(file_loc=file_to_compress, archive_name=archive_name, force_file_permissions=force_file_permissions, force_dir_permissions=force_dir_permissions) else: # directory_structure == 'retain': archive.add_member(file_loc=file_to_compress, archive_name=None, force_file_permissions=force_file_permissions, force_dir_permissions=force_dir_permissions) # get rid of the archive object, which has an open # filehandle, mode 'wb' on the archive file: # not closing this would prevent us from seeing what # has been written to the files. 
del archive # now see if we need to return anything: if compress_to == 'my_file': return None elif compress_to == 'temp_file': return tf_name else: # compress_to == 'byte_string': return open(archive_fullname, 'rb').read() except: # Clean up non-temporary file if we get an error: _delete_files(_remove_on_error) raise finally: # Always clean up temporary files, error or not: _delete_files(_remove_always) try: os.chdir(old_cwd) except: pass class ArchiveTool: def __init__(self, file_loc, mode, allow_clobber=False): raise Exception("method must be overided in child class") def list_all_members(self): raise Exception("method must be overided in child class") # Should return dictionary: # { filename = # tar_location = # new_location = # file_type = # } def extract_member(self, identity_object, root_dir, dir, new_filename, umask, force_file_permissions=None, force_dir_permissions=None, allow_clobber=False): raise Exception("method must be overided in child class") def add_member(self, file_loc, archive_name=None, force_file_permissions=None, force_dir_permissions=None): raise Exception("method must be overided in child class") class tarArchiveTool(ArchiveTool): # Overide this in child classes tarbz2ArchiveTool and # targzArchiveTool to make the mode string reflect the required # compression. def _mode_string(string): return string + ':' _mode_string = staticmethod(_mode_string) def __init__(self, file_loc, mode, allow_clobber=False): if mode not in ('r', 'w'): raise ValueError('mode argument must equal "r" or "w"') if mode == 'w': if os.path.exists(file_loc) and not allow_clobber: raise ClobberError(file_loc) # Set adjusted mode to reflect whether we are dealing with a # tar.gz tar.bz2 or just a tar. adjusted_mode = self._mode_string(mode) self._tarfile_obj = tarfile.open(name=file_loc, mode=adjusted_mode) self._tarfile_obj.errorlevel=2 self._mode = mode self._filename = os.path.basename(file_loc) self._file_loc = file_loc def list_all_members(self): _check_mode(self._mode, 'r') f = lambda tarinfo_obj: { 'name' : os.path.basename(os.path.normpath(tarinfo_obj.name)), 'dir' : os.path.dirname(os.path.normpath(tarinfo_obj.name)), 'file_type' : _file_type(tarinfo_obj), 'identity_object' : tarinfo_obj } return map(f, self._tarfile_obj.getmembers()) def extract_member(self, identity_object, root_dir, dir, new_filename, umask, force_file_permissions=None, force_dir_permissions=None, allow_clobber=False): _check_mode(self._mode, 'r') tarinfo_obj = identity_object output_location = os.path.join(root_dir, dir, new_filename) if os.path.exists(output_location) and not allow_clobber: raise ClobberError(output_location) # Extract the file to the given location. saved_name = tarinfo_obj.name tarinfo_obj.name = os.path.join(dir, new_filename) saved_mode = tarinfo_obj.mode tarinfo_obj.mode = _calc_perms(tarinfo_obj.mode, umask) # Apply umask to permissions. 
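
A round-trip sketch for the extract()/create() pair defined above (illustration only; the member names and byte strings are made up):

    from cdsware import elmsubmit_EZArchive as EZArchive

    # Pack two in-memory byte strings into a tar.gz held as a byte string:
    archive_bytes = EZArchive.create(input=[['notes/a.txt', 'first file'],
                                            ['notes/b.txt', 'second file']],
                                     input_disposition='named_byte_strings',
                                     compression='tar.gz',
                                     compress_to='byte_string')

    # Unpack it again without touching the filesystem permanently:
    for info in EZArchive.extract(input=archive_bytes,
                                  compression_hint='tar.gz',
                                  extract_to='byte_strings'):
        if info['file_type'] == 'regular':
            print info['basename'], len(info['file'])
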
try: self._tarfile_obj.extract(tarinfo_obj, root_dir) except EnvironmentError, e: if e.errno == 13: def f(): # Have already done this, but permissions might # have caused a fallacious answer previously: if os.path.exists(output_location) and not allow_clobber: raise ClobberError(output_location) elif os.path.exists(output_location) and allow_clobber: if os.path.isdir(output_location): # can ignore dirs; we can overwrite them # whatever their current perms pass else: # non-write permissions will prevent # .extract method from overwriting, so # unlink first: os.unlink(output_location) return self._tarfile_obj.extract(tarinfo_obj, root_dir) number_dotdot = _count_dotdot(dir) if number_dotdot != 0: # This is the reason why allow_dotdot_paths = True is v. dangerous: barrier_dir = None # shunted_root_dir = os.path.join(root_dir, '../' * number_dotdot) # normed_shunted_root_dir = os.path.normpath(shunted_root_dir) # barrier_dir = normed_shunted_root_dir else: barrier_dir=root_dir _provide_dir_with_perms_then_exec(dir=os.path.join(root_dir, dir), function=f, perms=0700, barrier_dir=barrier_dir) else: raise tarinfo_obj.name = saved_name tarinfo_obj.mode = saved_mode # If we've been asked to force permissions, do so: type = _file_type(tarinfo_obj) if type == 'regular': if force_file_permissions is not None: try: os.chmod(output_location, force_file_permissions) except EnvironmentError, e: if e.errno == 13: f = lambda: os.chmod(output_location, force_file_permissions) _provide_dir_with_perms_then_exec(dir=os.path.join(root_dir, dir), function=f, perms=0700, barrier_dir=root_dir) else: raise elif type == 'dir': if force_dir_permissions is not None: try: os.chmod(output_location, force_dir_permissions) except EnvironmentError, e: if e.errno == 13: f = lambda: os.chmod(output_location, force_dir_permissions) _provide_dir_with_perms_then_exec(dir=os.path.join(root_dir, dir), function=f, perms=0700, barrier_dir=root_dir) else: raise else: # We don't attempt to play with permissions of special # file types. pass def add_member(self, file_loc, archive_name=None, force_file_permissions=None, force_dir_permissions=None): _check_mode(self._mode, 'w') if archive_name is None: archive_name = file_loc tarinfo_obj = self._tarfile_obj.gettarinfo(name=file_loc, arcname=archive_name) if tarinfo_obj is None: if WARN_SKIP: sys.stderr.write("Skipping unsupported file type (eg. 
socket): %s\n" % (file_loc)) return None if os.path.isdir(file_loc) and force_dir_permissions is not None: tarinfo_obj.mode = force_dir_permissions if os.path.isfile(file_loc) and force_file_permissions is not None: tarinfo_obj.mode = force_file_permissions if tarinfo_obj.isfile(): self._tarfile_obj.addfile(tarinfo_obj, open(file_loc, 'rb')) else: self._tarfile_obj.addfile(tarinfo_obj) class targzArchiveTool(tarArchiveTool): def _mode_string(string): return string + ':gz' _mode_string = staticmethod(_mode_string) class tarbz2ArchiveTool(tarArchiveTool): def _mode_string(string): return string + ':bz2' _mode_string = staticmethod(_mode_string) class zipArchiveTool(ArchiveTool): pass class CompressTool: # Use to prevent trying to compress multiple files into the # unstructured gz file (if you want to do this, use a tar.gz, # tar.bz2, zip instead!): _write_protected = False def __init__(self, file_loc, mode, allow_clobber=False): """ Overided child methods must set class properties: self._fh self._filename self._file_loc self._mode """ raise Exception("method must be overided in child class") def list_all_members(self): _check_mode(self._mode, 'r') uncompressed_filename = re.sub(r'\.' + self._ext + r'$', '', string=self._filename, count=1) return [{ 'name' : uncompressed_filename, 'dir' : '', 'file_type' : 'regular', 'identity_object' : None } ] def extract_member(self, identity_object, root_dir, dir, new_filename, umask, force_file_permissions=None, force_dir_permissions=None, allow_clobber=False): _check_mode(self._mode, 'r') output_location = os.path.join(root_dir, dir, new_filename) if os.path.exists(output_location) and not allow_clobber: raise ClobberError(output_location) elif os.path.exists(output_location) and allow_clobber: # unlink instead of just overwriting: this makes sure the # file permissions take the umask into account: os.unlink(output_location) output_fh = open(output_location, 'wb') output_fh.write(self._fh.read()) output_fh.close() # See if we need to force the file permissions. Otherwise, we # do nothing, since open call above will have obeyed the # system umask. 
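        # (Illustration, not a module default: under the common umask 0022 the
        # open() call above leaves the extracted member with mode 0644; a caller
        # who wants a private copy could pass force_file_permissions=0600 and
        # the chmod below will tighten it accordingly.)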
if force_file_permissions is not None: os.chmod(output_location, force_file_permissions) def add_member(self, file_loc, archive_name=None, force_file_permissions=None, force_dir_permissions=None): if not os.path.isfile(file_loc): raise EZArchiveError("%s file format only supports compression of regular files: %s" % (self._ext, file_loc)) if not self._write_protected: input_fh = open(file_loc, 'rb') self._fh.write(input_fh.read()) input_fh.close() self._fh.close() self._write_protected = True else: raise EZArchiveError('tried to compress more than one file into a single %s file' % (self._ext)) class gzCompressTool(CompressTool): def __init__(self, file_loc, mode, allow_clobber=False): if mode not in ('r', 'w'): raise ValueError('mode argument must equal "r" or "w"') if mode == 'w': if os.path.exists(file_loc) and not allow_clobber: raise ClobberError(file_loc) self._fh = gzip.GzipFile(file_loc, mode=mode+'b') self._filename = os.path.basename(file_loc) self._file_loc = file_loc self._mode = mode self._ext = 'gz' class bz2CompressTool(CompressTool): def __init__(self, file_loc, mode, allow_clobber=False): if mode not in ('r', 'w'): raise ValueError('mode argument must equal "r" or "w"') if mode == 'w': if os.path.exists(file_loc) and not allow_clobber: raise ClobberError(file_loc) self._fh = bz2.BZ2File(file_loc, mode=mode+'b') self._filename = os.path.basename(file_loc) self._file_loc = file_loc self._mode = mode self._ext = 'bz2' available_tools = { 'tar' : tarArchiveTool, 'tar.gz' : targzArchiveTool, 'tar.bz2' : tarbz2ArchiveTool, 'zip' : zipArchiveTool, 'gz' : gzCompressTool, 'bz2' : bz2CompressTool } # Errors: class _ModeError(Exception): """ This is a private error raised iff there is an attempt to use a class method that is not allowed by the 'mode' in which the class instance has been instantiated. Eg. If we have created a CompressTool in write mode, and we try to use a method intended only for use in read mode. This should only occur in the case of a programming error in the module. """ pass class _NotInArchive(Exception): """ A private error raised iff there is an attempt to extract a file from a given archive that does not exist inside the archive. This should only occur in the case of a programming error in the module. """ pass class EZArchiveError(Exception): pass class ClobberError(EZArchiveError): pass def tester(tar): t = targzArchiveTool(tar, mode='r', allow_clobber=False) for mem in t.list_all_members(): name = mem['name'] dir = mem['dir'] identity_object = mem['identity_object'] t.extract_member(identity_object=identity_object, root_dir='/tmp', dir=dir, new_filename=name, umask=0002, force_file_permissions=None, force_dir_permissions=None, allow_clobber=False) def tester2(file): tar = tarfile.open(file, mode="r:gz") for tarinfo in tar: tar.extract(tarinfo, '/tmp/') tar.close() - - diff --git a/modules/elmsubmit/lib/elmsubmit_EZArchive.py.wml b/modules/elmsubmit/lib/elmsubmit_EZArchive.py.wml deleted file mode 100644 index 88e9decd2..000000000 --- a/modules/elmsubmit/lib/elmsubmit_EZArchive.py.wml +++ /dev/null @@ -1,1040 +0,0 @@ -# -*- coding: utf-8 -*- - -## $Id$ - -## This file is part of the CERN Document Server Software (CDSware). -## Copyright (C) 2002, 2003, 2004, 2005 CERN. -## -## The CDSware is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License as -## published by the Free Software Foundation; either version 2 of the -## License, or (at your option) any later version. 
-## -## The CDSware is distributed in the hope that it will be useful, but -## WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -## General Public License for more details. -## -## You should have received a copy of the GNU General Public License -## along with CDSware; if not, write to the Free Software Foundation, Inc., -## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - - -import gzip -import bz2 -import zipfile -import tarfile -import shutil -import os -import copy -import re -import tempfile -import glob -import sys - -WARN_SKIP = True - -from cdsware.elmsubmit_filename_generator import calculate_filename_extension as _calculate_filename_extension -# from cdsware.elmsubmit_filename_generator import generate_filename as _generate_filename -from cdsware.elmsubmit_misc import write_to_and_return_tempfile_name as _write_to_and_return_tempfile_name -from cdsware.elmsubmit_misc import provide_dir_with_perms_then_exec as _provide_dir_with_perms_then_exec -from cdsware.elmsubmit_misc import dirtree as _dirtree -from cdsware.elmsubmit_misc import count_dotdot as _count_dotdot -from cdsware.elmsubmit_misc import get_perms as _get_perms -from cdsware.elmsubmit_misc import random_alphanum_string as _random_alphanum_string -from cdsware.elmsubmit_misc import backup_directory as _backup_directory -from cdsware.elmsubmit_misc import open_tempfile as _open_tempfile -from cdsware.elmsubmit_misc import split_common_path as _split_common_path -from cdsware.elmsubmit_misc import recursive_dir_contents as _recursive_dir_contents -from cdsware.elmsubmit_misc import concat as _concat -from cdsware.elmsubmit_misc import mkdir_parents as _mkdir_parents - -# Store all files written out in two lists: -# 1. remove_always is for temporary files, which we try to remove regardless. -# 2. remove_on_error is for files the user wants, but need removing if we -# encounter an error. - -def _validate_args(arg, received, allowed): - if received not in allowed: - raise ValueError('argument %s must be a value from set %s: got %s' % (arg, allowed, received)) - -_remove_on_error = [] -_remove_always = [] - -def _remember_write(file_loc, error_only=False): - if error_only: - _remove_on_error.append(file_loc) - else: - _remove_always.append(file_loc) - -def _delete_files(list): - for item in list: - if os.path.isdir(item): - shutil.rmtree(item) - else: - os.unlink(item) - -def _calc_perms(permissions, umask): - return permissions & (~umask) - -## os.chmod('/tmp/thisthis', stat.S_IMODE(os.stat('/tmp')[stat.ST_MODE])) - -def _check_mode(current_mode, allowed_mode): - if current_mode != allowed_mode: raise _ModeError - - -_valid_file_types = ['regular', 'dir', 'symlink', 'hardlink', 'char_dev', 'block_dev', 'fifo'] - -def _file_type(tarinfo_obj): - if tarinfo_obj.isfile(): - return 'regular' - elif tarinfo_obj.isdir(): - return 'dir' - elif tarinfo_obj.issym(): - return 'symlink' - elif tarinfo_obj.islnk(): - return 'hardlink' - elif tarinfo_obj.ischr(): - return 'char_dev' - elif tarinfo_obj.isblk(): - return 'block_dev' - elif tarinfo_obj.isfifo(): - return 'fifo' - -def _pick_compression_type(ext): - # Fix the extension; for example if its a gzipped pdf, - # calculate_filname_extension will return pdf.gz. To combat - # this, we find the longest extension from: tar.gz, tar.bz2, - # tar, gz, bz2. 
- return re.sub(r'^.*?(tar\.gz|tar\.bz2|tar|gz|bz2)$', r'\1', string=ext, count=1) - -def _verify_filename(name, seen_filenames, filename_collision, num_random_bits, rename_from_set): - - # name could be a filename or directory. - - if seen_filenames.has_key(name): - seen_filenames[name] += 1 - times = seen_filenames[name] - - (dirname, basename) = os.path.split(name) - - if filename_collision == 'throw_error': - raise EZArchiveError('filename collision: %s' % (name)) - elif filename_collision == 'rename_similar': - - # Just in case archive contains a list of - # filenames that follow the same pattern as this - # incrementing, we need to check the increment - # doesn't collide as well: - incremented_basename = str(times) + '.' + basename - while seen_filenames.has_key(os.path.join(dirname, incremented_basename)): - times += 1 - incremented_basename = str(times) + '.' + basename - - # Make a note of how many increments we've had to - # do: - seen_filenames[name] = times - name = os.path.join(dirname, incremented_basename) - - elif filename_collision == 'rename_random': - # Just in case of random collision, we introduce the while loop. - randbasename = _random_alphanum_string(num_random_bits, chars=rename_from_set) - tries = 1 - while seen_filenames.has_key(os.path.join(dirname, randbasename)): - randbasename = _random_alphanum_string(num_random_bits, chars=rename_from_set) - # If user gives small set rename_from_set and low number of bits, - # then it is possible we will exhaust all posibile combinations: - tries += 1 - if tries > 20: - raise EZArchiveError('20 random filename selections collided: perhaps you need to increase num_rand_bits?') - seen_filenames[os.path.join(dirname, randbasename)] = 0 - name = os.path.join(dirname, randbasename) - elif filename_collision == 'overwrite': - pass - elif filename_collision == 'skip': - return ['skip'] - else: - seen_filenames[name] = 0 - - return name - -def extract(input, # byte string of file location - input_disposition='byte_string', # ['byte_string', 'file_location'] - compression_hint=None, # [None, 'gz', 'bz2', 'tar', 'tar.gz', 'tar.bz2', 'zip'] - extract_to='byte_strings', # ['byte_strings', 'my_directory', 'temp_directory'] - - my_directory=None, # directory path - backup_extension=None, # extension including dot, for backup of my_directory - directory_structure='retain', # ['retain', 'flatten'] - - file_handle = None, # [None, 'py', 'os'] - file_handle_mode = 'rb', - - force_file_permissions=None, # file permission bits. eg 0777. - force_dir_permissions=None, # file permission bits. eg 0777. - umask=None, # file permission bits. eg. 0777 (assuming standard umask interpretation). - - allow_file_types=_valid_file_types, # list containing any of ['regular, dir, symlink, hardlink, char_dev, block_dev, fifo'] - on_find_invalid_file_type='throw_error', # ['throw_error', 'skip'] - - filename_collision='rename_similar', # ['throw_error', 'rename_similar', 'rename_random', 'overwrite', 'skip'] - rename_from_set='abcdefghijklmnopqrstuvwxyz', # characters to use if filename_collision='rename_random' - num_random_bits=8, # number of random bits to use in the random filename. - - allow_clobber=False, # [True, False] - - on_find_dotdot_path='throw_error', # ['throw_error', 'skip', 'allow'] - on_find_absolute_path='throw_error' # ['throw_error', 'skip', 'allow'] - - # Shelved options: - # file_name_regexp, non_matches='rename_safely', etc. - # Hopefully to be implemented in the future. 
- ): - - # Clean out the written files list: - global _remove_on_error - global _remove_always - _remove_on_error = [] - _remove_always = [] - - # Validate arguments. - _validate_args('input_disposition', input_disposition, ['byte_string', 'file_location']) - _validate_args('compression_hint', compression_hint, [None] + available_tools.keys()) - _validate_args('extract_to', extract_to, ['byte_strings', 'my_directory', 'temp_directory']) - # _validate_args('extract_to', return_objects, [None, 'file_location', 'open_c_filehandles', 'open_py_file_handles']) - f = lambda type: _validate_args('allow_file_types', type, _valid_file_types) - map(f, allow_file_types) - if not input: raise ValueError('argument input must specify a filename or a byte string') - - # From here on, we start writing things out to disk, so we wrap it - # in a try loop and catch all exceptions. This allows us to clean - # up the disk if we didn't succeed with the whole of the - # extraction. - - try: - # try/except/finally cannot be combined, so we have to nest: - try: - # Write input to a temp file if we are given a byte string. - if input_disposition == 'byte_string': - input_file_loc = _write_to_and_return_tempfile_name(input) - _remember_write(input_file_loc) - else: - # input_disposition == 'file_location' - # Check that the input file location we've been given exists; - # stat will throw the right error for us: - os.stat(input) - - # Check it is a file: - if not os.path.isfile(input): - raise ValueError("argument input must be a path to an archive file if input_disposition='file_location': %s" - % (input)) - input_file_loc = input - - # Make sure we know what type of file we're dealing with: - if compression_hint is None: - compression_ext = _calculate_filename_extension(filename=input_file_loc) - compression_ext = _pick_compression_type(compression_ext) - else: - compression_ext = compression_hint - - # Select approriate archive/compression tool: - try: - tool_class = available_tools[compression_ext] - except KeyError: - raise EZArchiveError('Unrecognized archive type: %s' % (compression_ext)) - - # Instantiate the tool: - archive = tool_class(input_file_loc, mode='r', allow_clobber=allow_clobber) - - if extract_to == 'byte_strings': - # If extract_to == byte_strings, permissions mean nothing. - # However, because we use a temp directory to load the files - # into byte strings, we force the permissions to be nice and - # liberal inside the temp dir: - force_file_permissions = 0700 - force_dir_permissions = 0700 - - # Get extraction_root: - if extract_to == 'byte_strings' or extract_to == 'temp_directory': - # Need a temp directory to work in. - extraction_root = tempfile.mkdtemp() - - if extract_to == 'byte_strings': - _remember_write(extraction_root, error_only=False) - else: - # extract_to == 'temp_directory': - _remember_write(extraction_root, error_only=True) - else: - # extract_to == 'my_directory': - - if my_directory is None: - raise ValueError("my_directory must be specified if extract_to='my_directory'") - - # Make given directory into a nice sane one. - my_directory = os.path.abspath(os.path.expanduser(os.path.normpath(my_directory))) - - # Check it exists, and we can stat it: - # stat will throw the right error for us: - os.stat(my_directory) - - # Check it is a dir. 
- if not os.path.isdir(my_directory): - raise ValueError("argument my_directory must be a directory: %s" % (my_directory)) - - # If we've been asked to back it up, do so: - if backup_extension is not None: - backup_dir = my_directory + backup_extension - if _backup_directory(my_directory, backup_dir) is not None: - raise EZArchiveError('creation of backup directory using GNU mirrordir failed: %s' % (backup_dir)) - - # Finally set the extraction root: - extraction_root = my_directory - - # Logically we would also check we have write permissions - # here. But this is acutally better served by letting - # builtin/other functions raise EnvironmentErrors when we fail - # to write: Checking for write permissions is actually quite - # complex: e.g. you'd have to check group membership to see if - # the group bits allow write. - - # If we haven't been given a umask, use take the system umask as a - # default. If we have been given a umask, set the system umask to - # it, so all calls to builtin open/file apply the given umask: - if umask is None: - # It doesn't seem possible to read the umask without also - # setting it. Hence this fudge: - umask = os.umask(0777) - os.umask(umask) - - # Used in the extraction for loop to check for filename collisions - # when flattening directory structure: - seen_filenames = {} - - # Collect the returned file information here: - return_data = [] - - for mem in archive.list_all_members(): - name = mem['name'] - dir = mem['dir'] - file_type = mem['file_type'] - identity_object = mem['identity_object'] - - # Check it is an allowed file type: - if file_type not in allow_file_types: - if on_find_invalid_file_type=='skip': - continue - else: - # on_find_invalid_file_type='throw_error': - raise EZArchiveError("found disallowed file type '%s': %s" % (file_type, os.path.join(dir, name))) - - # Deal with dotdot paths: - if on_find_dotdot_path == 'allow': - pass - else: - # check if path contains '..' - dir_parts = dir.split(os.sep) - if '..' in dir_parts or name == '..': - if on_find_dotdot_path == 'throw_error': - raise EZArchiveError("tar entry's path contains '..' (*cautiously* consider on_find_dotdot_path='allow'): " - + os.path.join(dir, name)) - else: - # on_find_dotdot_path == 'skip' - # next file please: - continue - - # Deal with absolute paths in a similar way: - if on_find_absolute_path == 'allow': - pass - else: - # check if path begins with '/' - if dir != '' and dir[0] == '/': - if on_find_absolute_path == 'throw_error': - raise EZArchiveError("tar entry's path is absolute (*cautiously* consider on_find_absolute_path='allow'): " - + os.path.join(dir, name)) - else: - # on_find_absolute_path == 'skip' - # next file please: - continue - - # Deal with flattening of directories: - if directory_structure == 'flatten': - dir = '' - - if file_type == 'dir': - continue - - # tars allow multiple entries for same path/file: - # extracting such tarballs with GNU/tar will just - # cause the second entry to overwrite the first. 
We - # try to be more graceful: - - verified_fullname = _verify_filename(name=os.path.join(dir, name), seen_filenames=seen_filenames, - filename_collision=filename_collision, num_random_bits=num_random_bits, - rename_from_set=rename_from_set) - - if verified_fullname == ['skip']: continue - name = os.path.basename(verified_fullname) - - archive.extract_member(identity_object=identity_object, root_dir=extraction_root, dir=dir, new_filename=name, - umask=umask, force_file_permissions=force_file_permissions, force_dir_permissions=force_dir_permissions, - allow_clobber=allow_clobber) - - fullname = os.path.join(extraction_root, dir, name) - - file_info = {} - file_info['basename'] = name - file_info['tar_dir'] = dir - file_info['file_type'] = file_type - - if extract_to == 'byte_strings': - if file_type == 'regular': - file_info['file'] = open(fullname, 'rb').read() - else: - # extract_to in ['my_directory', 'temp_directory'] - file_info['fullname'] = fullname - file_info['dirname'] = os.path.join(extraction_root, dir) - - if file_type == 'regular': - if file_handle == 'py': - file_info['fh'] = open(fullname, file_handle_mode) - elif file_handle == 'os': - file_info['fh'] = os.open(fullname, file_handle_mode) - - return_data.append(file_info) - - if extract_to == 'temp_directory': - return (extraction_root, return_data) - else: - return return_data - - except: - # Clean up non-temporary file if we get an error: - _delete_files(_remove_on_error) - raise - finally: - # Always clean up temporary files, error or not: - _delete_files(_remove_always) - -def create(input, # list of files or named ([['name', 'data...'], ...]) or anonymous ([[data...], ...]) byte strings. - input_disposition='named_byte_strings', # ['file_locations', 'anonymous_byte_strings', 'named_byte_strings'] - - compression='tar.gz', # ['gz', 'bz2', 'tar', 'tar.gz', 'tar.bz2', 'zip'] - - compress_to = 'byte_string', # ['byte_string', 'my_file', 'temp_file'] - my_file=None, # name of output archive, if compress_to='my_file' - recurse_dirs=True, # [True, False] - - directory_structure='retain', # ['retain', 'flatten'] - use_compression_root='calculate_minimum', # ['calculate_minimum', 'this_root'] - this_root=None, # root path for compression of files. - - filename_collision='rename_similar', # ['throw_error', 'rename_similar', 'rename_random', 'overwrite', 'skip'] - rename_from_set='abcdefghijklmnopqrstuvwxyz', # characters to use if filename_collision='rename_random' - num_random_bits=8, # number of random bits to use in the random filename. - - force_file_permissions=None, # file permission bits. eg 0777. - force_dir_permissions=None, # file permission bits. eg 0777. - - file_handle = None, # [None, 'py', 'os'] - file_handle_mode = 'rb', - - allow_clobber=False, # [True, False] - ): - - # Basic idea: If we are told to output an archive (tar or zip) - # then all files given in input are put into a single archive. If - # we are told to output compressed files (gz, bz2) then we must be - # given a maximum of one archive file. - - # If we are given anonymous byte strings with no filename, we use - # filename_generator.generate_filename() to provide a random - # filename with hopefully a correct file extension. - - # Clean out written files list: - global _remove_on_error - global _remove_always - _remove_on_error = [] - _remove_always = [] - - # Validate arguments. - # ?????????????????? - - # From here on, we start writing things out to disk, so we wrap it - # in a try loop and catch all exceptions. 
This allows us to clean - # up the disk if we didn't succeed with the whole of the - # extraction. - - try: - # try/except/finally cannot be combined, so we have to nest: - try: - # Write input to a temp file if we are given a byte string. - - # Work out where the output archive file is going to go: - if compress_to == 'my_file': - if my_file is None: - raise ValueError("if compress_to == 'my_file' then argument my_file must be specified. got None.") - - # Make given file into a nice sane one: - archive_fullname = os.path.abspath(os.path.expanduser(os.path.normpath(my_file))) - - # Should we remember this file or not? If we get an error in - # the middle of processing, should we delete a user specified - # archive file? The decision is not so clear cut as with - # temporary files (see next). My choice is not to remember - # (and so not to delete on error) - - else: - # compress_to in ['temp_file', 'byte_string'] - (tf, tf_name) = _open_tempfile(mode='wb') - - # close filehandle because we don't need it: - tf.close() - - # delete the empty tempfile that open_tempfile - # created, so we don't get ClobberError - os.unlink(tf_name) - del tf - - if compress_to == 'temp_file': - _remember_write(tf_name, error_only=True) - else: - # compress_to == 'byte_string' - _remember_write(tf_name, error_only=False) - - archive_fullname = tf_name - - # Get an archive/compress tool: - tool_class = available_tools[compression] - archive = tool_class(file_loc=archive_fullname, mode='w', allow_clobber=allow_clobber) - - # Deal with the input: - # We do this as follows: - - # 1. Take anonymous byte strings and turn them into byte strings - # by generating a filename for each string, then set - # input=[new list of named byte strings] - # input_disposition='named_byte_strings' - - # 2. Take named byte strings and write them to a temporary - # directory, chdir to this directory and set: - # input = [glob of temp dir] - # input_diposition = 'file_locations' - - if input_disposition == 'anonymous_byte_strings': - # If input is anonymous byte strings, we need generate a filename - # for each of the strings: - seen_rand_names = [] - - def f(bytstr): - rand_name = _random_alphanum_string(num_random_bits, chars=rename_from_set) - tries = 1 - while rand_name in seen_rand_names: - rand_name = _random_alphanum_string(num_random_bits, chars=rename_from_set) - tries += 1 - if tries > 20: - raise EZArchiveError('20 random filename selections collided: perhaps you need to increase num_rand_bits?') - seen_rand_names.append(rand_name) - return [rand_name, bytstr] - - input = map(f, input) - input_disposition = 'named_byte_strings' - - if input_disposition == 'named_byte_strings': - # Write the byte strings out to the temporary directory. 
- temp_dir = tempfile.mkdtemp() - _remember_write(temp_dir, error_only=False) - - if this_root is not None: - # santize: - this_root = os.path.abspath(os.path.expanduser(os.path.normpath(this_root))) - # chop off the root slashes: - this_root = re.sub(r'^/+', '', string=this_root, count=1) - # rejig the root dir to reflect the fact we've shoved - # everything under a psuedo-root temp directory: - this_root = os.path.join(temp_dir, this_root) - - new_input = [] - seen_filenames = {} - - for filename, bytestr in input: - # Sanitize the filename we've been given: - filename = os.path.abspath(os.path.expanduser(os.path.normpath(filename))) - # chop off the root slashes: - filename = re.sub(r'^/+', '', string=filename, count=1) - - dirname = os.path.dirname(filename) - - # Use temp_dir as a 'fake_root': (There is some possible - # dodginess here if the user names one of the files as if - # it were inside the not yet existant temp directory: - # unlikely scenario; should we work around it? I haven't. - _mkdir_parents(os.path.join(temp_dir, dirname)) - - filename = _verify_filename(name=filename, seen_filenames=seen_filenames, - filename_collision=filename_collision, num_random_bits=num_random_bits, - rename_from_set=rename_from_set) - if filename == ['skip']: continue - - tempfile_fullname = os.path.join(temp_dir, filename) - - open(tempfile_fullname, 'wb').write(bytestr) - new_input.append(tempfile_fullname) - - input = new_input - input_disposition='file_locations' - - # At this point, input_disposition='file_locations' and input contains a list of filenames. - - # sanitize the list of filenames - f = lambda x: os.path.abspath(os.path.expanduser(os.path.normpath(x))) - input = map(f, input) - - # Expand any directories into filenames (excluding symlinks): - new_input = [] - for item in input: - if os.path.isdir(item): - new_input.append(item) - if recurse_dirs: - new_input.extend(_recursive_dir_contents(item)) - else: - new_input.append(item) - input = new_input - - # calculate the compression root: - if use_compression_root == 'calculate_minimum': - first_input = input[0] - if input == filter(lambda x: x == first_input, input): - # all of the filenames we've been given are the same: - compression_root = os.path.dirname(first_input) - files_to_compress = [os.path.basename(first_input)] * len(input) - else: - # find out the common root of the filenames: - (compression_root, files_to_compress) = _split_common_path(input) - # if compression_root was also specified in input, it will - # have become a blank entry '' in files_to_compress: - files_to_compress = filter(lambda x: (x != '' and True) or False, files_to_compress) - else: - # use_compression_root == 'this_root': - if this_root is None: - raise EZArchiveError("if compression_root=='this_root' then argument this_root must be specified") - - this_root = os.path.abspath(os.path.expanduser(os.path.normpath(this_root))) - - # check that this_root is indeed a prefix of all of the input - # files we've been given: - if input != filter(lambda file: this_root in _dirtree(file), input): - raise EZArchiveError('not all files specified in argument input are children of argument this_root') - # get rid of the entries that are exactly this_root: - input = filter(lambda file: file != this_root, input) - - compression_root = this_root - - # Chop off this_root from input: - if this_root == '/' or this_root == '//': - this_root_len = len(this_root) - else: - this_root_len = len(this_root + '/') - files_to_compress = map(lambda file: file[this_root_len:], 
input) - - old_cwd = os.getcwd() - os.chdir(compression_root) - - seen_filenames = {} - for file_to_compress in files_to_compress: - - if directory_structure == 'flatten': - if os.path.isdir(file_to_compress): - continue - - archive_name = os.path.basename(file_to_compress) - - archive_name = _verify_filename(name=archive_name, seen_filenames=seen_filenames, - filename_collision=filename_collision, - num_random_bits=num_random_bits, - rename_from_set=rename_from_set) - if archive_name == ['skip']: continue - - archive.add_member(file_loc=file_to_compress, archive_name=archive_name, - force_file_permissions=force_file_permissions, - force_dir_permissions=force_dir_permissions) - - else: - # directory_structure == 'retain': - archive.add_member(file_loc=file_to_compress, archive_name=None, - force_file_permissions=force_file_permissions, - force_dir_permissions=force_dir_permissions) - - # get rid of the archive object, which has an open - # filehandle, mode 'wb' on the archive file: - # not closing this would prevent us from seeing what - # has been written to the files. - del archive - - # now see if we need to return anything: - if compress_to == 'my_file': - return None - elif compress_to == 'temp_file': - return tf_name - else: - # compress_to == 'byte_string': - return open(archive_fullname, 'rb').read() - except: - # Clean up non-temporary file if we get an error: - _delete_files(_remove_on_error) - raise - finally: - # Always clean up temporary files, error or not: - _delete_files(_remove_always) - try: - os.chdir(old_cwd) - except: - pass - -class ArchiveTool: - - def __init__(self, file_loc, mode, allow_clobber=False): - raise Exception("method must be overided in child class") - - def list_all_members(self): - raise Exception("method must be overided in child class") - # Should return dictionary: - # { filename = - # tar_location = - # new_location = - # file_type = - # } - - def extract_member(self, identity_object, root_dir, dir, new_filename, umask, force_file_permissions=None, - force_dir_permissions=None, allow_clobber=False): - raise Exception("method must be overided in child class") - - def add_member(self, file_loc, archive_name=None, force_file_permissions=None, force_dir_permissions=None): - raise Exception("method must be overided in child class") - -class tarArchiveTool(ArchiveTool): - - # Overide this in child classes tarbz2ArchiveTool and - # targzArchiveTool to make the mode string reflect the required - # compression. - - def _mode_string(string): - return string + ':' - - _mode_string = staticmethod(_mode_string) - - def __init__(self, file_loc, mode, allow_clobber=False): - if mode not in ('r', 'w'): raise ValueError('mode argument must equal "r" or "w"') - - if mode == 'w': - if os.path.exists(file_loc) and not allow_clobber: - raise ClobberError(file_loc) - - # Set adjusted mode to reflect whether we are dealing with a - # tar.gz tar.bz2 or just a tar. 
- adjusted_mode = self._mode_string(mode) - - self._tarfile_obj = tarfile.open(name=file_loc, mode=adjusted_mode) - self._tarfile_obj.errorlevel=2 - self._mode = mode - self._filename = os.path.basename(file_loc) - self._file_loc = file_loc - - def list_all_members(self): - _check_mode(self._mode, 'r') - - f = lambda tarinfo_obj: { 'name' : os.path.basename(os.path.normpath(tarinfo_obj.name)), - 'dir' : os.path.dirname(os.path.normpath(tarinfo_obj.name)), - 'file_type' : _file_type(tarinfo_obj), - 'identity_object' : tarinfo_obj } - - return map(f, self._tarfile_obj.getmembers()) - - def extract_member(self, identity_object, root_dir, dir, new_filename, umask, force_file_permissions=None, - force_dir_permissions=None, allow_clobber=False): - _check_mode(self._mode, 'r') - - tarinfo_obj = identity_object - - output_location = os.path.join(root_dir, dir, new_filename) - - if os.path.exists(output_location) and not allow_clobber: - raise ClobberError(output_location) - - # Extract the file to the given location. - - saved_name = tarinfo_obj.name - tarinfo_obj.name = os.path.join(dir, new_filename) - saved_mode = tarinfo_obj.mode - tarinfo_obj.mode = _calc_perms(tarinfo_obj.mode, umask) # Apply umask to permissions. - - try: - self._tarfile_obj.extract(tarinfo_obj, root_dir) - except EnvironmentError, e: - if e.errno == 13: - - def f(): - # Have already done this, but permissions might - # have caused a fallacious answer previously: - if os.path.exists(output_location) and not allow_clobber: - raise ClobberError(output_location) - elif os.path.exists(output_location) and allow_clobber: - if os.path.isdir(output_location): - # can ignore dirs; we can overwrite them - # whatever their current perms - pass - else: - # non-write permissions will prevent - # .extract method from overwriting, so - # unlink first: - os.unlink(output_location) - return self._tarfile_obj.extract(tarinfo_obj, root_dir) - - number_dotdot = _count_dotdot(dir) - - if number_dotdot != 0: - # This is the reason why allow_dotdot_paths = True is v. dangerous: - barrier_dir = None -# shunted_root_dir = os.path.join(root_dir, '../' * number_dotdot) -# normed_shunted_root_dir = os.path.normpath(shunted_root_dir) -# barrier_dir = normed_shunted_root_dir - else: - barrier_dir=root_dir - - _provide_dir_with_perms_then_exec(dir=os.path.join(root_dir, dir), function=f, perms=0700, barrier_dir=barrier_dir) - else: - raise - - tarinfo_obj.name = saved_name - tarinfo_obj.mode = saved_mode - - # If we've been asked to force permissions, do so: - type = _file_type(tarinfo_obj) - - if type == 'regular': - if force_file_permissions is not None: - try: - os.chmod(output_location, force_file_permissions) - except EnvironmentError, e: - if e.errno == 13: - f = lambda: os.chmod(output_location, force_file_permissions) - _provide_dir_with_perms_then_exec(dir=os.path.join(root_dir, dir), function=f, perms=0700, barrier_dir=root_dir) - else: - raise - - elif type == 'dir': - if force_dir_permissions is not None: - try: - os.chmod(output_location, force_dir_permissions) - except EnvironmentError, e: - if e.errno == 13: - f = lambda: os.chmod(output_location, force_dir_permissions) - _provide_dir_with_perms_then_exec(dir=os.path.join(root_dir, dir), function=f, perms=0700, barrier_dir=root_dir) - else: - raise - else: - # We don't attempt to play with permissions of special - # file types. 
- pass - - def add_member(self, file_loc, archive_name=None, force_file_permissions=None, force_dir_permissions=None): - _check_mode(self._mode, 'w') - - if archive_name is None: - archive_name = file_loc - - tarinfo_obj = self._tarfile_obj.gettarinfo(name=file_loc, arcname=archive_name) - - if tarinfo_obj is None: - if WARN_SKIP: - sys.stderr.write("Skipping unsupported file type (eg. socket): %s\n" % (file_loc)) - return None - - if os.path.isdir(file_loc) and force_dir_permissions is not None: - tarinfo_obj.mode = force_dir_permissions - - if os.path.isfile(file_loc) and force_file_permissions is not None: - tarinfo_obj.mode = force_file_permissions - - if tarinfo_obj.isfile(): - self._tarfile_obj.addfile(tarinfo_obj, open(file_loc, 'rb')) - else: - self._tarfile_obj.addfile(tarinfo_obj) - -class targzArchiveTool(tarArchiveTool): - - def _mode_string(string): - return string + ':gz' - - _mode_string = staticmethod(_mode_string) - -class tarbz2ArchiveTool(tarArchiveTool): - - def _mode_string(string): - return string + ':bz2' - - _mode_string = staticmethod(_mode_string) - -class zipArchiveTool(ArchiveTool): - pass - -class CompressTool: - # Use to prevent trying to compress multiple files into the - # unstructured gz file (if you want to do this, use a tar.gz, - # tar.bz2, zip instead!): - _write_protected = False - - def __init__(self, file_loc, mode, allow_clobber=False): - """ - Overided child methods must set class properties: - self._fh - self._filename - self._file_loc - self._mode - """ - - raise Exception("method must be overided in child class") - - def list_all_members(self): - _check_mode(self._mode, 'r') - - uncompressed_filename = re.sub(r'\.' + self._ext + r'$', '', string=self._filename, count=1) - - return [{ 'name' : uncompressed_filename, - 'dir' : '', - 'file_type' : 'regular', - 'identity_object' : None } ] - - def extract_member(self, identity_object, root_dir, dir, new_filename, umask, force_file_permissions=None, - force_dir_permissions=None, allow_clobber=False): - _check_mode(self._mode, 'r') - - output_location = os.path.join(root_dir, dir, new_filename) - - if os.path.exists(output_location) and not allow_clobber: - raise ClobberError(output_location) - elif os.path.exists(output_location) and allow_clobber: - # unlink instead of just overwriting: this makes sure the - # file permissions take the umask into account: - os.unlink(output_location) - - output_fh = open(output_location, 'wb') - output_fh.write(self._fh.read()) - output_fh.close() - - # See if we need to force the file permissions. Otherwise, we - # do nothing, since open call above will have obeyed the - # system umask. 
- if force_file_permissions is not None: - os.chmod(output_location, force_file_permissions) - - def add_member(self, file_loc, archive_name=None, force_file_permissions=None, force_dir_permissions=None): - - if not os.path.isfile(file_loc): - raise EZArchiveError("%s file format only supports compression of regular files: %s" % (self._ext, file_loc)) - - if not self._write_protected: - input_fh = open(file_loc, 'rb') - self._fh.write(input_fh.read()) - - input_fh.close() - self._fh.close() - - self._write_protected = True - else: - raise EZArchiveError('tried to compress more than one file into a single %s file' % (self._ext)) - -class gzCompressTool(CompressTool): - - def __init__(self, file_loc, mode, allow_clobber=False): - if mode not in ('r', 'w'): raise ValueError('mode argument must equal "r" or "w"') - - if mode == 'w': - if os.path.exists(file_loc) and not allow_clobber: - raise ClobberError(file_loc) - - self._fh = gzip.GzipFile(file_loc, mode=mode+'b') - self._filename = os.path.basename(file_loc) - self._file_loc = file_loc - self._mode = mode - self._ext = 'gz' - -class bz2CompressTool(CompressTool): - def __init__(self, file_loc, mode, allow_clobber=False): - if mode not in ('r', 'w'): raise ValueError('mode argument must equal "r" or "w"') - - if mode == 'w': - if os.path.exists(file_loc) and not allow_clobber: - raise ClobberError(file_loc) - - self._fh = bz2.BZ2File(file_loc, mode=mode+'b') - self._filename = os.path.basename(file_loc) - self._file_loc = file_loc - self._mode = mode - self._ext = 'bz2' - -available_tools = { 'tar' : tarArchiveTool, - 'tar.gz' : targzArchiveTool, - 'tar.bz2' : tarbz2ArchiveTool, - 'zip' : zipArchiveTool, - 'gz' : gzCompressTool, - 'bz2' : bz2CompressTool } - -# Errors: - -class _ModeError(Exception): - """ - This is a private error raised iff there is an attempt to use a - class method that is not allowed by the 'mode' in which the class - instance has been instantiated. Eg. If we have created a - CompressTool in write mode, and we try to use a method intended - only for use in read mode. - - This should only occur in the case of a programming error in the - module. - """ - - pass - -class _NotInArchive(Exception): - """ - A private error raised iff there is an attempt to extract a file - from a given archive that does not exist inside the archive. - - This should only occur in the case of a programming error in the - module. - """ - - pass - -class EZArchiveError(Exception): - - pass - -class ClobberError(EZArchiveError): - - pass - -def tester(tar): - - t = targzArchiveTool(tar, mode='r', allow_clobber=False) - - for mem in t.list_all_members(): - - name = mem['name'] - dir = mem['dir'] - identity_object = mem['identity_object'] - - t.extract_member(identity_object=identity_object, root_dir='/tmp', dir=dir, new_filename=name, - umask=0002, force_file_permissions=None, force_dir_permissions=None, allow_clobber=False) - -def tester2(file): - tar = tarfile.open(file, mode="r:gz") - for tarinfo in tar: - tar.extract(tarinfo, '/tmp/') - tar.close() - - - diff --git a/modules/elmsubmit/lib/elmsubmit_EZEmail.py b/modules/elmsubmit/lib/elmsubmit_EZEmail.py index 57c071566..1fd5d076d 100644 --- a/modules/elmsubmit/lib/elmsubmit_EZEmail.py +++ b/modules/elmsubmit/lib/elmsubmit_EZEmail.py @@ -1,1992 +1,1989 @@ -# -*- coding: utf-8 -*- - -## $Id$ - +# -*- coding: utf-8 -*- +## +## $Id$ +## ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002, 2003, 2004, 2005 CERN. 
## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - """ # Side note: CJK codecs at http://cjkpython.i18n.org/. Exports blah blah blah. Speed: Testing it on a random sample of 1500 messages culled from my INBOX, it took an average of 5/100ths seconds to process each message. (Running on a Linux P4 2Ghz machine). Shortcomings: - Does not support message/partial mime type. The message/partial mime type is designed to allow mailers to split up the body of large messages into several 'message/partial' parts, which can then be sent inside seperate email messages to the intended receipient (see RFC2046 for the precise semantics). Upon receipt the MUA is supposed to recombine the parts into the orignal message. Supporting this fairly rare MIME type would add a lot of complexity to the module; there would have to be a mechanism for passing several email messages to the Message class constructor; further, - Simply deletes RFC2231 language specification extensions from RFC2047 encoded header text. The rest of RFC2231 relating to header parameter values is observed. Note that a language specfication is different from a charset. This is an issue in very few circumstances. Hopefully it won't be an issue at all eventually, because soon charsets should allow language specifications to be built into them, and whatever happens in the future to unicode (and accordingly python's unicode object and function) will adapt to this. Quoting from RFC2231: > 8. Character sets which allow specification of language > > In the future it is likely that some character sets will provide > facilities for inline language labeling. Such facilities are > inherently more flexible than those defined here as they allow for > language switching in the middle of a string. > > If and when such facilities are developed they SHOULD be used in > preference to the language labeling facilities specified here. Note > that all the mechanisms defined here allow for the omission of > language labels so as to be able to accommodate this possible future > usage. 
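A minimal usage sketch (the .eml file name is illustrative only, and the
import path assumes the module is installed under the same cdsware package
as the cdsware.* imports used below):

    from cdsware.elmsubmit_EZEmail import ParseMessage

    msg_text = open('example.eml', 'rb').read()
    msg = ParseMessage(msg_text, strict=False)

    print msg.from_email       # e.g. u'person@example.org'
    print msg.subject
    print msg.date_sent_utc    # u'YYYY-MM-DD HH:MM:SS', in UTC
    print msg.primary_message  # best guess at the intended body text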
""" # Quicker to do "from email import *" but this gives us a reminder of # what's available: import email import email.Message import email.Parser import email.Generator import email.Header import email.Charset import email.Encoders import email.MIMENonMultipart import email.MIMEMultipart import email.MIMEMessage import email.MIMEText import email.Utils import email.Errors import mimetypes # Non email imports: import time import datetime import os import StringIO import quopri import uu import base64 import re import rtf.Rtf2Txt from rtf.RtfParser import RtfException as _RtfException import cdsware.elmsubmit_richtext2txt as _richtext2txt import cdsware.elmsubmit_enriched2txt as _enriched2txt import cdsware.elmsubmit_html2txt as _html2txt from cdsware.elmsubmit_misc import concat as _concat from cdsware.elmsubmit_misc import cr2lf as _cr2lf from cdsware.elmsubmit_misc import random_alphanum_string as _random_alphanum_string from cdsware.elmsubmit_misc import wrap_text as _wrap_text from cdsware.elmsubmit_filename_generator import generate_filename as _generate_filename import cdsware.elmsubmit_EZArchive as elmsubmit_EZArchive # Message Parsing: # (Search down to "# Message Creation:" also.) _default_handling_hints = { 'generate_filename' : 'always', 'descend_message_rfc822_attachments' : True, 'archive_format' : 'tar.gz', 'archive_multipart_unrecognized': True, 'archive_multipart_parallel': True, 'archive_multipart_related': True, 'generate_filename' : 'if_missing' } class ParseMessage(object): """ This class provides a very simple representation of an email message. It is basically a wrapper around the class ParseMessage provided by the Email package that comes with Python. It is designed to give simple access to the body text, attachments etc. without the programmer having to consider the evil complexities of MIME. It READ ONLY. ie. Don't expect to produce a new email by mutating the data returned by the class methods! Instance properties: self.headers Returns the headers of the message as a Python dictionary object. The data structure _headers might look as follows: _headers = { 'to' : [u"alice@example.com, bob@example.org"], 'recieved' : [u"from cernfe02.cern.ch ([137.138 etc...", u"from client.cern.ch ([137.138. etc..."], 'date' : [u"Wed, 4 Aug 2004 15:07:17 +0200 (W. Europe Daylight Time)"] } ie. It is a python dictionary: the keys are headers and the values are lists of header values (note that an email (RFC822) message may contain duplicate headers). Each list contains the header values in the relative order they appear in the original message. rfc822 headers are case-insensitive so we store them in lowercase. Header processing decodes each header from its RFC2047 encoded format then further decodes it from the native charsets into unicode. Hence the list values in the data structure are unicode strings. Note that the _keys_ are just regular ascii strings, since we should be able to rely on header keys being 7-bit ascii only. self.received_data self.primary_message Returns the best guess at what the intended message is. self.original_message Returns the orignal message as supplied to the constructor. self.attachments self.inline_attachments self.from_header self.subject self.from_name self.from_email self.message_id self.date_sent_utc Returns an ISO8601 formatted unicode string of the date the email was sent (in UST/GMT). ISO8601 strings look like this: "YYYY-MM-DD HH:MM:SS". 
If the Date: header is not present or unparsable, the current time on the system (in GMT) is substituted instead. """ def __init__(self, message_string, strict=True, hints=_default_handling_hints): # message_string is a string representing an email (aka rfc822 # message). strict determines +++++++++++++ # Save the original message string (can be accessed later by # self.origMessage method) self.original_message = message_string # Create an email.Message object from the plain text. msg = email.message_from_string(message_string) # Now populate self.headers; intended to be accessed later by # self.headers method. The data structure is described in the # .headers method docstring. self.headers = _process_headers(msg, msg.items(), force_processing=(not strict)) # Now we move on to calculating _from_name, _from_email # Of course, there might not be a From: header in a very # broken email, so we raise a FromHeaderError if this is the # case. try: # KeyError means email is missing 'from' field: from_header = self.headers['from'][0] # If mutliple From: fields, use 1st. self.from_addr = from_header # from_header could be None if we are operating with # strict=False and we failed to decode the header: if from_header is None: raise FromHeaderParsingError(msg) (from_name, from_email) = _parse_from_header(msg, from_header) except KeyError: if strict: raise FromHeaderMissingError(msg) else: from_name = None from_email = None except FromHeaderParsingError: if strict: raise # Reraise the error. else: from_name = None from_email = None self.from_name = from_name self.from_email = from_email try: self.subject = self.headers['subject'][0] except KeyError: self.subject = '' # Should we put None here? try: self.message_id = self.headers['message-id'][0] except KeyError: self.message_id = '' # Should we put None here? # Process the received headers, extracting the 'received from' host and ip address: try: self.received_data = map(_received_ip_and_host, self.headers['received']) except KeyError: # There were no recieved headers; should this be an error # in strict mode or not? # I think not, since people can save email locally without sending it. self.received_data = None # Now calculate _date_sent_utc. Test to see if there actually # is a date header and if we can parse it. If running in # strict mode, then we throw an error. If not, then simply use # the localtime. try: date_in_rfc2822_format = self.headers['date'][0] remote_struct_time_with_utc_offset = email.Utils.parsedate_tz(date_in_rfc2822_format) # email.Utils.parsedate_tz returns None on failure. if remote_struct_time_with_utc_offset is None: raise _ParseDateError() remote_struct_time = remote_struct_time_with_utc_offset[0:9] (remote_offset_from_utc_in_seconds,) = remote_struct_time_with_utc_offset[9:10] if remote_offset_from_utc_in_seconds is None: raise _ParseDateError() except (KeyError, _ParseDateError): if strict: raise ParseDataError(msg) else: # Use local time on error. 
remote_struct_time = time.gmtime() remote_offset_from_utc_in_seconds = 0 date_time_args = remote_struct_time[0:6] # datetime constructor only needs first 6 parts of struct_time tuple # filter(lambda x: x is None: date_time_args) # if filter != []: raise ParseDateError(msg) remote_time = datetime.datetime(*date_time_args) remote_utc_delta = datetime.timedelta(seconds=remote_offset_from_utc_in_seconds) # local_utc_delta = datetime.timedelta(seconds= -time.timezone) utc_time = remote_time - remote_utc_delta # local_time = utc_time + local_utc_delta # Now that we have the date sent in utc, we just format it to # an ISO8601 string and convert that to a unicode # object. Since the ISO8601 string will only contain us-ascii, # this conversion should not fail and so we need not check for # decoding errors. self.date_sent_utc = unicode(utc_time.isoformat(sep=' '), 'us-ascii') # self._date_sent_local = local_time.isoformat(sep=' ') # Now we parse the email and attempt to calculate what the # primary message (ie. what would pop-up in the message pane # of your email client) would be. (self.attachments, self.inline_attachments, self.primary_message) = _get_msg_structure(msg, strict, hints) def contents(filename): f = file(filename, "r") p = email.Parser.Parser() msg = p.parse(f) def walk_email(msg,indent=0): print "-"*indent, msg.get_content_type() if msg.is_multipart(): for part in msg.get_payload(): walk_email(part,indent+8) walk_email(msg) f.close() ####### __main__ # f = open('blower.eml','r') # e = f.read() # f = open('testSpliter/torture-test.eml','r') # tort = f.read() # f = open('just_text.eml','r') # e2 = f.read() # f = open('hello.eml','r') # e3 = f.read() # f = open('rtf2.eml') # e4 = f.read() # f = open('attached_msg.eml') # e5 = f.read() # f = open('/tmp/eg/example.jpg','r') # jpg = f.read() # f = open('/tmp/eg/example.msword.doc','r') # word = f.read() # f = open('/tmp/eg/example.xls','r') # excel = f.read() # f = open('/tmp/eg/example.pdf','r') # pdf = f.read() # f = open('/tmp/eg/example.reg.gz','r') # reg = f.read() # f = open('/tmp/eg/example.wk3','r') # lotus = f.read() # f = open('/tmp/eg/example.xml','r') # xml = f.read() # f = open('/tmp/eg/example.tar.gz','r') # targz = f.read() # f = open('/tmp/eg/example.tar','r') # tar = f.read() # f = open('/tmp/eg/example.zip','r') # zip_data = f.read() # f = open('/tmp/eg/example.xml.bz2','r') # bz2eg = f.read() # Support functions. def _received_ip_and_host(received_header): host_re = re.compile(r"""from\ ( # from marks the start of the received target [a-z0-9]+(?:[a-z0-9_.-]+[a-z0-9])? # Match a domain string. ) # Allow illegal but common underscores # (eg. the famous dear_raed.blogspot.com) [)\s] # Terminate with space or a closing bracket depending on the format. 
""",re.VERBOSE|re.IGNORECASE) ipad_re = re.compile(r"""[[(] # match opening bracket or parenthesis # (should be a bracket if following standards) ((?:\d{1,3}\.){3} # match three octets with dots \d{1,3}) # match a single octet with no dot [])] # match the closing bracket/parenthesis """, re.VERBOSE|re.IGNORECASE) host_match = host_re.search(received_header) if host_match is not None: host = host_match.group(1) else: host = None ipad_match = ipad_re.search(received_header) if ipad_match is not None: ipad = ipad_match.group(1) else: ipad = None return (host, ipad) def _basic_email_info(msg): """ Takes an email.Message object and returns a dictionary, formatted like the following example, containing a basic subset of information about the message: { from: u'Ann Other ' from_email : u'person@example.org', from_name : u'Ann Other', subject : u'This email is about...', message-id : u'1234567890@host.example.com', } Any header which cannot be decoded to unicode will be returned in it original encoded form. Check with type(value) = unicode. This function can be used when throwing an error to gather just enough information about the message so clients of the elmsubmit_EZEmail.ParseMessage class can respond to the email author reporting the error. """ # Items we want to try and return: # If you wish to tailor this list, note that basic_headers MUST # have 'from' in it, otherwise the following code breaks! basic_headers = ['from', 'subject', 'message-id'] # 'from_name' and 'from_email' aren't headers; # they are derivatives of the 'from' header. # The hash to be built up and returned: return_dict = {} # Get all header/value pairs for which the header is also in list # basic_headers (case insensitively): f = lambda (k,v): k.lower() in basic_headers basic_items = filter(f, msg.items()) # Now attempt to decode the basic headers to unicode objects: basic_decoded_headers = _process_headers(msg=None, header_value_pairs=basic_items, force_processing=True) # Since we're just using this for error output, we don't need to # worry about headers with the same header key; just accept the # first one present (and note that the list of headers in # basic_headers are all ones which _should_ only be appearing # once). g = lambda (k,v): (k,v[0]) basic_decoded_headers = dict(map(g, basic_decoded_headers.items())) try: # If the from header is missing this access will cause # KeyError: from_value = basic_decoded_headers['from'] # If from_header is None, we couldn't decode it and so can't # proceed in splitting it into from_name and from_email. Raise # TypeError. if from_value is None: raise TypeError # Could cause FromHeaderParsingError: (from_name, from_email) = _parse_from_header(msg, from_value) return_dict.update({ 'from_name': from_name, 'from_email': from_email }) except (TypeError, KeyError, FromHeaderParsingError): return_dict.update({ 'from_name': None, 'from_email': None }) # This loops over basic_headers and tries to index # basic_decoded_headers by each value. Anything that isn't present # (ie. we've failed to decode), we look up directly in the orginal # msg object (and return the value as a string in whatever charset # and RFC2047 encoding it arrived in) if _that_ fails (ie. the # header is missing from the message altogether) we set the value # in the hash to None. 
for header in basic_headers: value = basic_decoded_headers.get(header, None) if value == None: value = msg.get(header, None) return_dict[header] = value return return_dict def _native2unicode(value_nc, native_charset=None, strict=True): """ Function native2unicode is a wrapper around builtin function unicode. The difference is that native2unicode will accept a charset of None which will cause it to default to decoding from us-ascii. It also raises a custom error _UnicodeDecodingError which returns the problem value and charset, rather than LookupError/ValueError raised by the unicode builtin. """ errors = { True : 'strict', False: 'replace'}[strict] # Non-RFC2047 encoded parts return charset as None; we assume then # that they are us-ascii bytes. if native_charset is None: native_charset = 'us-ascii' # Remove RFC2123 language specification from document if present. # This is delimited from the charset by the '*' character. eg. We # might have # native_charset = 'us-ascii*en' # and we need to remove '*en'. # This is the key reason we have function _native2unicode, and # aren't just calling .decode(charset)! native_charset = re.sub(r'\*.*$', '', native_charset) # Search this document for RFC2123 for more information. # unicode function might not recognize the native_charset and # hence throw a LookupError. Or it might fail to do the conversion # and throw a UnicodeError try: return unicode(value_nc, native_charset, errors) except (LookupError, UnicodeError): raise _UnicodeDecodingError(value_nc, native_charset) def _process_headers(msg, header_value_pairs, force_processing=False): f = lambda headers, (header,value): _process_header(msg, headers, header, value, force_processing) return reduce(f, header_value_pairs, {}) def _decode_rfc2231_tuple(msg, value): try: try: (charset, lang_specification, encoded_value) = value # If charset is unspecified, then we can assume us-ascii: if charset == '': charset = 'us-ascii' return _native2unicode(encoded_value, charset, strict=True) except ValueError: # Data was not RFC2231 encoded. ie. value should be just an # ascii-string. # Note however that some broken email clients mistakenly use # RFC2047 encoding in parameterized header fields such as # Content-Type and Content-Disposition. This is disallowed by # RFC2047: # > + An 'encoded-word' MUST NOT be used in parameter of a # > MIME Content-Type or Content-Disposition field, or in any # > structured field body except within a 'comment' or # > 'phrase'. # In order to support these clients, if we get a string back # which wasn't RFC2231 encoded, we check instead to see if it # can be RFC2047 decoded. # Note that header='rfc2231_param' is just a dummy value; # we're not really decoding an rfc2047 encoded header; # just trying to support clients that mistakenly use # rfc2047 encoding for parameters _within_ structured # headers. return _decode_rfc2047_header(msg, header='rfc2231_param', value=value) except (_UnicodeDecodingError, HeaderCharsetError, HeaderRFC2047Error): return None def _decode_and_join_structured_header_pair(msg, key, value): # Take input that looks like this: # # key = 'title' # value = ('us-ascii', 'en', "This is even more ***fun*** isn't it!") # # And return it looking like this: # # u'title="This is even more ***fun*** isn't it!"' # The key should always be just a us-ascii string: try: decoded_key = _native2unicode(key, 'us-ascii') except _UnicodeDecodingError: raise _StructuredHeaderPairError(key, value) if value == '': # We have a structured entry that is not in key=value form. eg. 
The multipart/mixed in # 'Content-Type: multipart/mixed; boundary="------------050902070901080909090201"' return decoded_key else: decoded_value = _decode_rfc2231_tuple(msg, value) if decoded_value is None: raise _StructuredHeaderPairError(key, value) # Now escape string for addition of quotes either side: # Escape backslashes: decoded_value = re.sub(r'\\', r'\\\\', decoded_value) # Escape quotes: decoded_value = re.sub(r'"', r'\\"', decoded_value) return decoded_key + '="' + decoded_value + '"' def _decode_rfc2231_header(msg, header, value, force_processing=False): # We get the key/value pairs from the structured header by calling # the email.ParseMessage class's get_params method. This method deals # with all of the RFC2231 decoding for us, and so we just have to # reconstruct the tuples it gives us into a unicode string. # This means these headers are no longer suitable for parsing by # machine, but does make them suitable for display (which is # prefered from two mutually incompatable options; if you want to # start parsing Content-Type parameters, then you want to be using # the Python email package directly!). # Take a value that looks like: # value = # And turn it into a unicode string that looks like: # u'"This is even more ***fun*** isn't it!"' # The values in the tuple are from left to right are a charset, # language specification and string of encoded text. # We ignore the language specification. See list of module # shortcomings. # """ params = msg.get_params(None, header) # param should never return failobj None since we have already # verified the header we are requesting exists. if params is None: raise _EmailPackageError try: f = lambda (k,v): _decode_and_join_structured_header_pair(msg, k, v) joined_pairs = map(f, params) unicode_value = '; '.join(joined_pairs) except _StructuredHeaderPairError, e: if force_processing: unicode_value = None else: raise HeaderRFC2231Error(msg, header, value, e.key, e.value) return unicode_value def _decode_rfc2047_header(msg, header, value, force_processing=False): """ Take an rfc2047 encoded string and convert it to unicode. """ # For each header value two decoding steps happen: # 1. We decode the header from its RFC2047 encoded format. # 2. We decode the resulting information from its native # charset to a unicode object. # decode_header takes the RFC2047 encoded form and returns a list # of tuples (string_in_native_charset, charset_name). It is a # *list* not a single tuple, since it is permissible to use # multiple charsets in a single header value! # Although undocumented in the python library documentation, # looking at the email.Header source suggests decode_header might # raise an 'email.Errors.HeaderParseError'. We catch this and # raise our own 'HeaderRFC2047Error'. try: decoded_parts = email.Header.decode_header(value) # The _native2unicode function might not recognise one of the # charsets and so throw a private _UnicodeDecodingError. If we # get one, then we catch it and raise public error # "HeaderCharsetError". f = lambda (value, charset): _native2unicode(value, charset, not force_processing) unicode_decoded_parts = map(f, decoded_parts) # Since all members of decoded_parts are now in unicode we can # concatenate them into a single header value string. 
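# For reference, email.Header.decode_header returns native-charset chunks;
# the value below is the standard library's own documented example
# (illustrative only):
#
#   >>> email.Header.decode_header('=?iso-8859-1?q?p=F6stal?=')
#   [('p\xf6stal', 'iso-8859-1')]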
unicode_value = u''.join(unicode_decoded_parts) except email.Errors.HeaderParseError: if force_processing: unicode_value = None else: raise HeaderRFC2047Error(msg, header, value) except _UnicodeDecodingError, e: if force_processing: unicode_value = None else: raise HeaderCharsetError(msg, header, value, e.value, e.charset) return unicode_value def _process_header(msg, headers, header, value, force_processing=False): # Function _process_header takes a partial headers dictionary # and a header/value pair and updates the dictionary with this # pair. # Headers are decoded from their RFC2231 encoding and turned into # unicode strings. # For Content-Type and Content-Disposition headers only, an # alternative decoding step happens; we attempt to decode these # structured headers from their RFC2231 encoding and rebuild them # as unicode strings. if header.lower() in ('content-type', 'content-disposition'): unicode_value = _decode_rfc2231_header(msg, header, value, force_processing) else: unicode_value = _decode_rfc2047_header(msg, header, value, force_processing) # Repeated header keys are legal, so we store dictionary # values as a list. Therefore we must check if this header # key has already been initialized in the dictionary. header = header.lower() headers.setdefault(header, []) # If key header isn't present, add it with value [] headers[header].append(unicode_value) return headers def _parse_from_header(msg, from_header): ### !!! Need to do some thinking about internationalized email ### !!! addresses and domain names to check what problems ### !!! these may cause. (from_name, from_email) = email.Utils.parseaddr(from_header) # Check we were able to parse the From: field # (email.Utils.parseaddr returns ('','') on failure) and that # from_email is not empty. Otherwise raise a FromHeaderParsingError # empty from_name is OK, since we just use from_email as the # author's 'name'. 
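# Hedged examples of email.Utils.parseaddr behaviour (values illustrative):
#
#   >>> email.Utils.parseaddr('Ann Other <person@example.org>')
#   ('Ann Other', 'person@example.org')
#   >>> email.Utils.parseaddr('person@example.org')
#   ('', 'person@example.org')
#
# so an empty from_name with a non-empty from_email is normal; only the
# ('', '') failure case (or an empty from_email) is treated as an error below.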
if (from_name, from_email) == ('','') or from_email == '': raise FromHeaderParsingError(msg) elif from_name == '': from_name = from_email return (from_name, from_email) def _get_msg_structure(msg, strict, hints): mime_helper = _MimeHelper(msg) # mime_helper.maintype = 'multipart' # mime_helper.subtype = 'parallel' mime_handler = _get_mime_handler(mime_helper.maintype, mime_helper.subtype) nominal_attachments = mime_handler(msg, mime_helper, strict, hints) attachments = [] inline_attachments = [] primary_msg = '' for item in nominal_attachments: if item['disposition'] != 'attachment': if item['maintype'] == 'text' and item['downgrading_to_text'] is None: primary_msg += _force_ends_in_newline(item['file']) inline_attachments.append(item) elif item['downgrading_to_text'] is not None: primary_msg += _force_ends_in_newline(item['downgrading_to_text']) inline_attachments.append(item) else: attachments.append(item) else: attachments.append(item) return (attachments, inline_attachments, primary_msg) def _force_ends_in_newline(string): if string == '' or string[-1] != '\n': return string + '\n' else: return string def _get_mime_handler(maintype, subtype): try: handler = _mime_handler_map[maintype][subtype] except KeyError: try: handler = _mime_handler_map_unrecognized_subtype[maintype] except KeyError: handler = _mime_handler_unrecognized_maintype # Create a 'wrapper' function which does preparatory checks we # want to happen for all mime type before executing the real mime # handler (possibly we could have used a class based approach to # allow for more levels of wrapping, but I think this may have # been a sledgehammer on nut): def parent_handler(msg, mhe, strict, hints): if mhe.decoded_payload is None: # and not mhe.msg_part.is_multipart(): if strict: raise MIMEPartError(msg, mhe, 'cte_decoding') else: return [] if mhe.filename == ['FilenameDecodingError']: if strict: raise MIMEPartError(msg, mhe, 'filename_decoding') else: mhe.filename = None if maintype == 'text': # Make sure text data has unix newline conventions. mhe.decoded_payload = _cr2lf(mhe.decoded_payload) try: mhe.file = _native2unicode(mhe.decoded_payload, mhe.charset, strict) except _UnicodeDecodingError: if strict: raise MIMEPartError(msg, mhe, 'unicode_conversion') else: return [] return handler(msg, mhe, strict, hints) return parent_handler # Generate filename values: never, always, if_missing def _format_msg_part_data(mhe, hints): part_info = {} part_info['file'] = mhe.file #'UNCOMMENT THIS TO SEE FILE!!!!!' #mhe.file part_info['downgrading_to_text'] = mhe.downgrading_to_text part_info['maintype'] = mhe.maintype part_info['subtype'] = mhe.subtype part_info['filename'] = mhe.filename part_info['disposition'] = mhe.disposition if mhe.maintype == 'text': part_info['original_charset'] = mhe.charset part_info['signature'] = mhe.signature part_info['encrypted'] = mhe.encrypted part_info['mac_resource_fork'] = mhe.mac_resource_fork part_info['rejected_alternatives'] = mhe.rejected_alternatives # Now see if we need to generate a filename: gf = hints['generate_filename'] if gf == 'always' or (gf == 'if_missing' and (mhe.filename is None or mhe.filename == '')): generated_filename = _generate_filename(file=mhe.decoded_payload, content_type=(mhe.maintype + '/' + mhe.subtype)) else: generated_filename = None part_info['generated_filename'] = generated_filename return part_info def _get_part_disposition(msg_part): """ Look to see whether this part is designated as inline, attachment or something else. 
""" # BNF of Content-Disposition header quoted from RFC2183: # disposition := "Content-Disposition" ":" # disposition-type # *(";" disposition-parm) # # disposition-type := "inline" # / "attachment" # / extension-token # ; values are not case-sensitive # The BNF states only "inline" or "attachment" are valid tokens # for disposition-type, so there is now need to worry about # RFC2231 encoded data. (And any extension tokens should be # similarly restricted to simple ascii). # This dictates that the disposition-type must be the first element in # the header. get_params returns something like this: # >>> msg_part.get_params(None, 'Content-Disposition') # [('inline', ''), ('filename', 'email.txt')] # So we have to index by [0][0] to get the disposition-type keyword. try: return msg_part.get_params(None, 'Content-Disposition')[0][0] except (TypeError, IndexError): return None def _get_part_filename(msg_part): """ Attempt to discover a filename associated with a message body part. Note that the filename, if it exists, is returned as a unicode string. Filenames may not be as simple as you expect; what you get may be a string of Arabic characters. """ # Note, we could just use the email.ParseMessage method get_filename to # try and discover a filename. However, this only checks the # Content-Disposition header for the filename parameter whereas we # would like to support crufty old clients which are still using # the Content-Type name parameter. missing = [] #First try content-disposition: filename = msg_part.get_param('filename', missing, 'content-disposition') if filename != missing: filename = _decode_rfc2231_tuple(msg_part, filename) if filename is None: return ['FilenameDecodingError'] else: return filename else: # If filename parameter of content-disposition is not # available, try name parameter of content-type: filename = msg_part.get_param('name', missing, 'content-type') if filename != missing: filename = _decode_rfc2231_tuple(msg_part, filename) if filename is None: return ['FilenameDecodingError'] else: # No filename available: return None def _cte_decode(msg_part): """ Return a message part's payload, decoded from its content transfer encoding. """ # Note that it is possible to use # msg_part.get_payload(decode=True) to do the CTE # decoding. Unfortunately, the error reporting of this method is # not very helpful; if CTE decoding fails, it just returns the # undecoded payload. This makes it hard to tell if there has been # success or not. Initially I thought decoding failure could be # identified by checking if: # msg_part.get_payload(decode=True) == msg_part.get_payload(decode=False) # But this method would flag false errors. For example, if text # contains no 'nasty characters' it will be the same both before # and after quoted-printable encoding (see the quoted-printable # RFC for a definition of 'nasty characters!); ie. for such text # quoted-printable encoding is an identity map. # The following function is essentially a cut-and-paste of the # Email.Message.Message class's get_payload method, edited to # raise _CTEDecodingError upon decoding failure. 
# In the case we are passed a multipart message part, we cannot # decode it and so throw _MultipartCTEDecodingAttempt: if msg_part.is_multipart(): raise _MultipartCTEDecodingAttempt payload = msg_part.get_payload(decode=False) cte = msg_part.get('content-transfer-encoding', '').lower() try: if cte == 'quoted-printable': # Could cause binascii.Error/Incomplete payload = quopri.decodestring(payload) elif cte == 'base64': # Could cause payload = _base64decode(payload) elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'): sfp = StringIO() uu.decode(StringIO(payload+'\n'), sfp) payload = sfp.getvalue() except (binascii.Error, binascii.Incomplete, uu.Error): raise _CTEDecodingError return payload def _base64decode(s): """ Decode bas64 encoded string. """ # This is a cut and paste of email.Utils._bdecode. We can't call # _bdecode directly, because its a private function of the Utils # modules and therefore not safe to import. # We can't quite use base64.encodestring() since it tacks on a "courtesy # newline". Blech! if not s: return s value = base64.decodestring(s) if not s.endswith('\n') and value.endswith('\n'): return value[:-1] return value # MIME Handlers: def _get_flattened_payload(msg_part, with_mime_headers=False): flattened_data = msg_part.as_string(unixfrom=False) # if with_mime_headers is False, then remove them: if not with_mime_headers: # Regex should remove from the start of the string up to the # first double newline '\n\n', with possibly space in-between # the newlines. This should chop of the mime_headers. # (?s) in the regex sets the DOTALL flag; ie. '.' matches everything including newline. flattened_data = re.sub(r'(?s)(.*?\n\n)', r'', flattened_data, count=1) return flattened_data def _email_structure_to_directory_structure(email_structure, directory_base=''): files = [] used_random = {} alt_part_number = 0 for item in email_structure: try: # Is it a list? (ie. a list of alternative parts): item.append # Does this throw AttributeError? If not, it is list like. alt_part_number += 1 files.extend(_email_structure_to_directory_structure(email_structure=item, directory_base=os.path.join(directory_base, ('alternative_data_' + str(alt_part_number))))) except AttributeError: # Or a dictionary? (ie. an actual part): possible_filenames = [item['filename'], item['generated_filename'], 'unamed_part_' + _get_unused_random(lambda: _random_alphanum_string(length=8), used_random)] available_filenames = filter(lambda x: x is not None, possible_filenames) filename = available_filenames[0] if item['file'] is not None: files.append((os.path.join(directory_base, filename), item['file'])) else: files.append((filename, '')) if item['downgrading_to_text'] is not None: files.append((os.path.join(directory_base, filename + '.txt'), item['downgrading_to_text'])) return files def _archive_part(msg, mhe, strict, hints): processed_parts = _get_mime_handler('multipart', 'mixed')(msg, mhe, strict, hints) files = _email_structure_to_directory_structure(processed_parts) mhe.file = elmsubmit_EZArchive.create(files, input_disposition='named_byte_strings', compress_to='byte_string', compression=hints['archive_format'], force_file_permissions=0664, force_dir_permissions=0775) mhe.filename = '_'.join([_random_alphanum_string(length=8), mhe.maintype, mhe.subtype, 'archive.' 
+ hints['archive_format']]) def _pick_from_alternatives(processed_parts): processed_parts.sort(multipart_alternative_sort) return (processed_parts[0], processed_parts[1:]) def multipart_alternative_sort(part1, part2): # We deal with multipart/alternative by prefering in descending # order: # text/plain, score 6 # text/html, score 5 # text/enriched, score 4 # text/richtext, score 3 # text/rtf, score 2 # application/rtf, score 1 # (the later five we make use of their 'downgrading_to_text') # Although text/richtext is a simpler format than text/html, it # has theoretically been obsoleted by text/enriched and so comes # lower in order of preference. # A note on the rich text mess: "Why are their four types of rich # text? Surely they're all the same thing?" Unfortunately not... # - RFC1341 defines a simple text markup for mime type # 'text/richtext'. # - RFC1896 (and some RFCS before it which 1896 obsoletes) defines # 'text/enriched' which is designed to solve the shortcomings of # 'text/richtext'; use of 'text/richtext' is deprecated in # favour of 'text/enriched' # - 'text/rtf' and 'application/rtf' refer to Microsoft's RTF file # format, and are not specified in any RFC (that I know of). They # are the same file format; it's just that the registration got # duplicated (people weren't sure whether to describe rtf as a # plaintext format; ie. readable when unparsed by humans, or # application (ie. needs to be parsed to make any sense of)! # Some useful reading: # http://mango.human.cornell.edu/kens/etf.html (text/enriched primer) # http://www.faqs.org/rfcs/rfc1896.html (text/enriched RFC) # http://www.faqs.org/rfcs/rfc1341.html (text/richtext RFC) # News message ID: <199306081944.AA13622@mudhoney.micro.umn.edu> # (the thread this message sits in contains the # registrations of text/rtf and application/rtf) liked_formats = [ 'text/plain', 'text/html', 'text/enriched', 'text/richtext', 'text/rtf', 'application/rtf' ] scorecard = dict(zip(liked_formats, range(len(liked_formats), 0, -1))) # Create something that looks like this: # {'application/rtf': 1, # 'text/enriched': 4, # 'text/html': 5, # 'text/plain': 6, # 'text/richtext': 3, # 'text/rtf': 2} # Doing the calculation instead of hardcoding allows liked_formats # to be rearranged more easily! # Part types not in this list get score 0. score1 = scorecard.get(part1['maintype'] + '/' + part1['subtype'], 0) score2 = scorecard.get(part2['maintype'] + '/' + part2['subtype'], 0) # We want the list in reverse order, big down to small: return cmp(score2, score1) def _get_unused_random(rand_function, used_random): r = rand_function() while used_random.has_key(r): r = rand_function() used_random[r] = True return r class _MimeHelper(object): def __init__(self, msg_part): self.msg_part = msg_part self.maintype = msg_part.get_content_maintype() self.subtype = msg_part.get_content_subtype() if self.maintype == 'text': self.charset = msg_part.get_content_charset('us-ascii') else: self.charset = None self.disposition = _get_part_disposition(msg_part) self.filename = _get_part_filename(msg_part) self.signed = False self.signature = None self.encrypted = False self.mac_resource_fork = None self.downgrading_to_text = None self.rejected_alternatives = None if msg_part.is_multipart(): # If multipart, get the flattened payload. 
self.decoded_payload = _get_flattened_payload(msg_part, with_mime_headers=False) else: # If its not multipart, attempt CTE decoding and store # result: try: self.decoded_payload = _cte_decode(msg_part) except (_CTEDecodingError, _MultipartCTEDecodingAttempt): self.decoded_payload = None def _mime_handler_application_applefile(msg, mhe, strict, hints): return [{mhe.maintype : 'not implemented yet'}] def _mime_handler_application_octetstream(msg, mhe, strict, hints): # application/octet-stream requires no special handling. All of # the necessary work has been done in the parent handler. mhe.file = mhe.decoded_payload return [ _format_msg_part_data(mhe, hints) ] def _mime_handler_application_pgpencrypted(msg, mhe, strict, hints): return [{mhe.maintype : 'not implemented yet', mhe.subtype : 'problems!'}] def _mime_handler_application_pgpkeys(msg, mhe, strict, hints): return [{mhe.maintype : 'not implemented yet', mhe.subtype : 'problems!'}] def _mime_handler_application_pgpsignature(msg, mhe, strict, hints): return [{mhe.maintype : 'not implemented yet', mhe.subtype : 'problems!'}] def _mime_handler_application_rtf(msg, mhe, strict, hints): # application/rtf is same as text/rtf, so call _get_mime_handler # to retrieve correct handler. # Note that we can't just execute _mime_handler_text_rtf directly, # because this would miss the neccessary parent_handler code, # which depends on knowing if maintype is 'text' (which the # misregistered application/rtf would hide): return _get_mime_handler('text', 'rtf')(msg, mhe, strict, hints) def _mime_handler_message_externalbody(msg, mhe, strict, hints): return [{mhe.maintype : 'not implemented yet', mhe.subtype : 'problems!'}] # def _mime_handler_message_news(msg, mhe, strict, hints): # pass # Currently just treat as application/octet-stream. def _mime_handler_message_partial(msg, mhe, strict, hints): if strict: raise MIMEPartError(msg, mhe, 'not_implemented') else: return [] def _mime_handler_message_rfc822(msg, mhe, strict, hints): if not hints['descend_message_rfc822_attachments']: # Treat as a binary attachment. return _get_mime_handler('application', 'octet-stream')(msg, mhe, strict, hints) else: # Descend into the message as if it were a multipart/mixed # type. return _get_mime_handler('multipart', 'mixed')(msg, mhe, strict, hints) def _mime_handler_multipart_alternative(msg, mhe, strict, hints): # We handle multipart alternative just like multipart mixed, but # then pick our prefered alternative, storing the remaining # alternatives in mhe.rejected_alternatives. (prefered, rejects) = _pick_from_alternatives(_get_mime_handler('multipart', 'mixed')(msg, mhe, strict, hints)) prefered['rejected_alternatives'] = rejects return [ prefered ] def _mime_handler_multipart_appledouble(msg, mhe, strict, hints): return [{mhe.maintype : 'not implemented yet', mhe.subtype : 'problems!'}] def _mime_handler_multipart_encrypted(msg, mhe, strict, hints): return [{mhe.maintype : 'not implemented yet', mhe.subtype : 'problems!'}] def _mime_handler_multipart_mixed(msg, mhe, strict, hints): # We ignore Content-Disposition for multipart/mixed parts, as want # to process them the same regardless. 
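# Illustration of the per-part dispatch used just below (grounded in the
# handler tables defined later in this module; the subtypes are examples):
#
#   _get_mime_handler('text', 'plain')  -> wrapped _mime_handler_text_plain
#   _get_mime_handler('image', 'png')   -> wrapped _mime_handler_application_octetstream
#                                          (no image subtypes are registered)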
# Generate mime helpers for each part of the multipart collection: mime_helpers = map(_MimeHelper, mhe.msg_part.get_payload()) # Get a mime handler for each part, and execute it: f = lambda mhe: _get_mime_handler(mhe.maintype, mhe.subtype)(msg, mhe, strict, hints) list_of_lists_of_processed_parts = map(f, mime_helpers) # Flatten the results: return _concat(list_of_lists_of_processed_parts) def _mime_handler_multipart_signed(msg, mhe, strict, hints): return [{mhe.maintype : 'not implemented yet', mhe.subtype : 'problems!'}] def _mime_handler_multipart_unrecognized(msg, mhe, strict, hints): if hints['archive_multipart_unrecognized']: _archive_part(msg, mhe, strict, hints) return [ _format_msg_part_data(mhe, hints) ] else: # Descend into the message as if it were a multipart/mixed # type. return _get_mime_handler('multipart', 'mixed')(msg, mhe, strict, hints) def _mime_handler_multipart_parallel(msg, mhe, strict, hints): if hints['archive_multipart_parallel']: _archive_part(msg, mhe, strict, hints) return [ _format_msg_part_data(mhe, hints) ] else: # Descend into the message as if it were a multipart/mixed # type. return _get_mime_handler('multipart', 'mixed')(msg, mhe, strict, hints) def _mime_handler_multipart_related(msg, mhe, strict, hints): if hints['archive_multipart_related']: _archive_part(msg, mhe, strict, hints) return [ _format_msg_part_data(mhe, hints) ] else: # Descend into the message as if it were a multipart/mixed # type. return _get_mime_handler('multipart', 'mixed')(msg, mhe, strict, hints) def _mime_handler_text_enriched(msg, mhe, strict, hints): # Covert the text/enriched data to plain text and store it: # mhe.file is already a unicode string. # enriched2txt function doesn't have any public errors: mhe.downgrading_to_text = _enriched2txt.enriched2txt(_native2unicode(mhe.decoded_payload, native_charset=mhe.charset, strict=strict)) return [ _format_msg_part_data(mhe, hints) ] def _mime_handler_text_html(msg, mhe, strict, hints): # Covert the text/richtext data to plain text and store it. We # pass richtext2txt the original non-unicode text string and it # will pass us back a unicode string: try: # html2txt expects unicode in, and spits unicode out: mhe.downgrading_to_text = _html2txt.html2txt(_native2unicode(mhe.decoded_payload, native_charset=mhe.charset, strict=strict), cols=72) except _html2txt.HTMLParsingFailed: if strict: raise MIMEPartError(msg, mhe, 'downgrading_to_text') else: mhe.downgrading_to_text = None return [ _format_msg_part_data(mhe, hints) ] def _mime_handler_text_plain(msg, mhe, strict, hints): return [ _format_msg_part_data(mhe, hints) ] def _mime_handler_text_richtext(msg, mhe, strict, hints): # Covert the text/richtext data to plain text and store it. We # pass richtext2txt the original non-unicode text string and it # will pass us back a unicode string: try: # richtext2txt always returns unicode for us: mhe.downgrading_to_text = _richtext2txt.richtext2txt(mhe.decoded_payload, charset=mhe.charset, convert_iso_8859_tags=True, force_conversion=(not strict)) except _richtext2txt.RichTextConversionError: if strict: raise MIMEPartError(msg, mhe, 'downgrading_to_text') else: mhe.downgrading_to_text = None return [ _format_msg_part_data(mhe, hints) ] def _mime_handler_text_rtf(msg, mhe, strict, hints): # Note: This parser has some unicode issues which need to be # fixed! The project seems fairly active... # Use RtfLib to convert rtf string to text. 
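# Illustration of the hints-driven archiving used by the multipart handlers
# above (hypothetical hints dict): with
#   hints = {'archive_multipart_related': True, 'archive_format': 'tar.gz', ...}
# a multipart/related part is bundled by _archive_part into a single .tar.gz
# attachment rather than being descended into like multipart/mixed.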
try: mhe.downgrading_to_text = rtf.Rtf2Txt.getTxt(_native2unicode(mhe.decoded_payload, native_charset=mhe.charset, strict=strict)) except _RtfException: if strict: raise MIMEPartError(msg, mhe, 'downgrading_to_text') else: mhe.downgrading_to_text = None return [ _format_msg_part_data(mhe, hints) ] # Content-Type to Handler mappings: _mime_handler_map_application = { 'applefile' : _mime_handler_application_applefile, 'octet-stream' : _mime_handler_application_octetstream, 'pgp-encrypted' : _mime_handler_application_pgpencrypted, 'pgp-keys' : _mime_handler_application_pgpkeys, 'pgp-signature' : _mime_handler_application_pgpsignature, 'rtf' : _mime_handler_application_rtf } _mime_handler_map_audio = { } # No special audio handlers defined. _mime_handler_map_image = { } # No special image handlers defined. _mime_handler_map_message = { 'external-body' : _mime_handler_message_externalbody, # 'news' : _mime_handler_application_octetstream, 'partial' : _mime_handler_message_partial, # not supported! 'rfc822' : _mime_handler_message_rfc822 } _mime_handler_map_model = { } # No special models handlers defined. _mime_handler_map_multipart = { 'alternative' : _mime_handler_multipart_alternative, 'appledouble' : _mime_handler_multipart_appledouble, 'encrypted' : _mime_handler_multipart_encrypted, 'mixed' : _mime_handler_multipart_mixed, 'parallel' : _mime_handler_multipart_parallel, 'related' : _mime_handler_multipart_related, 'signed' : _mime_handler_multipart_signed } _mime_handler_map_text = { 'enriched' : _mime_handler_text_enriched, 'html' : _mime_handler_text_html, 'plain' : _mime_handler_text_plain, 'richtext' : _mime_handler_text_richtext, 'rtf' : _mime_handler_text_rtf } _mime_handler_map_video = { } # No special video handlers defined. _mime_handler_map = { 'application' : _mime_handler_map_application, 'audio' : _mime_handler_map_audio, 'image' : _mime_handler_map_image, 'message' : _mime_handler_map_message, 'model' : _mime_handler_map_model, 'multipart' : _mime_handler_map_multipart, 'text' : _mime_handler_map_text, 'video' : _mime_handler_map_video } # Unrecognized types are handled according to the recomendations of # RFC2046 which mandates that unrecognized parts of given maintype be # dealt with as follows: # application -> application/octet-stream # audio -> application/octet-stream # image -> application/octet-stream # message -> application/octet-stream # model -> application/octet-stream # In the multipart case, however, we give the module client two # choices of how to treat unrecognized multipart sections: either as # multipart/mixed, or to wrap up each of the sub-parts into a tar.gz # and present this as if it had been a single attachment. # multipart -> multipart/mixed # text -> text/plain # video -> application/octet-stream _mime_handler_map_unrecognized_subtype = { 'application' : _mime_handler_application_octetstream, 'audio' : _mime_handler_application_octetstream, 'image' : _mime_handler_application_octetstream, 'message' : _mime_handler_application_octetstream, 'model' : _mime_handler_application_octetstream, 'multipart' : _mime_handler_multipart_unrecognized, 'text' : _mime_handler_text_plain, 'video' : _mime_handler_application_octetstream } _mime_handler_unrecognized_maintype = _mime_handler_application_octetstream # Message Creation: # Whereas ParseMessage is a class, CreateMessage is just a function # which returns the email as an ascii byte string. # Creation is an order of magnitude simpler than parsing. 
When parsing # we have to try and be able to cope with everything seen out 'in the # wild'. With creation, we can simply restrict what is allowed to be # created to a sensible set of options. # CreateMessage restricts you to a single plain text body plus any # number of attached files and any number of attached emails. This # will all be stuffed into a single multipart/mixed container (unless # there is only a single part to be added, in which case we skip the # multipart/mixed container). This is how email should be sent by good # internet citizens. If this doesn't fit your needs, then your needs # are esoteric (and if you want to send html email, then you're just # plain evil)! def CreateMessage(_from, to, subject, cc=None, bcc=None, message=None, attach_messages=[], attach_files=[], message_id=None, references=None, in_reply_to=None, date=None, wrap_message=False, cols=80): """ Returns a byte string containing the email constructed from the following arguments: _from: Either: 1. An ascii string already suitable for inclusion in this email header (eg. a string you have torn directory out of another email. 2. A 2-tuple (name, email_address), where name is a persons name and email_address is their email address. name must be a unicode object. email_address can be either a unicode object or a byte string. to, cc, bcc: Either: 1. An ascii string already suitable for inclusion in this email header (eg. a string you have torn directory out of another email. 2. A _list_ of items defined in the same way as _from option 1. subject: Either: 1. An ascii string already suitable for inclusion in this email header (eg. a string you have torn directory out of another email. 2. A unicode object. message: A unicode object containing what will be the message body text. attach_files: A list of 2-tuples, (filename, open_file_object) where filename must be a unicode object and open_file_object must be an open python file object in mode 'rb'. message_id: An ascii string containing a message-id. references: A list of objects defined like argument message_id. in_reply_to: A list of objects defined like argument message_id. date: A ascii string containing an rfc822 formatted date string. wrap_message: True/False whether you want to have the message body wrapped to the width given in argument cols. cols: A integer column width. """ if message is not None: mime_message = [_mimeify_message(message, wrap_message, cols)] else: mime_message = [] mime_attached_messages = map(_mimeify_attach_message, attach_messages) mime_attached_files = map(_mimeify_attach_file, attach_files) mime_parts = mime_message + mime_attached_messages + mime_attached_files if mime_parts == []: raise EZEmailCreateError("At least one of message, attach_messages or attach_files must be specified.") elif len(mime_parts) == 1: # Only one payload, so don't need multipart. main_part = mime_parts[0] else: main_part = email.MIMEMultipart.MIMEMultipart() map(main_part.attach, mime_parts) main_part.preamble = 'This message requires a mime aware email reader to be viewed correctly.\n' # Force ending in newline: main_part.epilogue = '' eH = email.Header.Header # The .encode() call here shouldn't be doing any encoding other # splitting the header onto multiple continuation lines, since we # are already providing eH with safely asciified strings. 
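# For illustration (hedged, not from the original source), the address
# helpers defined further down produce plain ascii strings such as:
#
#   >>> _mimeify_address((u'Ann Other', 'person@example.org'))
#   'Ann Other <person@example.org>'
#
# which eH(...).encode() then only folds onto continuation lines if needed.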
main_part['From'] = eH(_mimeify_address(_from)).encode() main_part['Subject'] = eH(_mimeify_unstructured(subject)).encode() for (header, value) in [('To', to),('Cc', cc), ('Bcc', bcc)]: if value is None: continue if isinstance(value, str): main_part[header] = eH(value).encode() else: main_part[header] = eH(', '.join(map(_mimeify_address, value))).encode() if message_id is not None: main_part['Message-ID'] = eH(message_id).encode() else: main_part['Message-ID'] = email.Utils.make_msgid() if references is not None: main_part['References'] = eH(', '.join(references)).encode() if in_reply_to is not None: main_part['In-Reply-To'] = eH(in_reply_to).encode() if date is not None: main_part['Date'] = eH(date).encode() else: main_part['Date'] = email.Utils.formatdate() # s = smtplib.SMTP() # print ">>>fnah" # s.connect(host='smtp.ox.ac.uk') # s.sendmail('one@tes.la', 'foo@tes.la', main_part.as_string()) # s.close() return main_part.as_string() def _mimeify_message(message, wrap_message, cols): if wrap_message: message = _wrap_text(message, cols) if _just_ascii(message): charset = 'us-ascii' else: charset = 'utf8' msg_part = email.MIMEText.MIMEText(_text=message.encode(charset), _subtype='plain', _charset=charset) msg_part.add_header('Content-Disposition', 'inline') return msg_part def _mimeify_attach_message(message_rfc822): message_rfc822 = email.message_from_string(message_rfc822) return email.MIMEMessage.MIMEMessage(message_rfc822, 'rfc822') def _mimeify_attach_file((filename_unicode, fh)): # fh = python file handle # Guess the content type based on file extension. content_type, encoding = mimetypes.guess_type(filename_unicode) if encoding == 'gzip': content_type = 'application/x-gzip' elif encoding == 'compress': content_type = 'application/x-gzip' elif encoding is not None: # we don't recognize the encoding: content_type = 'application/octet-stream' else: # encoding is None; we are safe to use the content_type # returned by mimetypes. pass # Check that mimetypes actually returned a content_type: if content_type is None: content_type = 'application/octet-stream' maintype, subtype = content_type.split('/', 1) if maintype == 'text': # This is what we should be doing: # msg_part = email.MIMEText.MIMEText(fh.read(), _subtype=subtype) # but until I gather together character encoding detection, # everything text is going to be attached as # application/octet-stream. 
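# Reference values for the mimetypes.guess_type() call above (illustrative,
# standard library behaviour):
#
#   >>> mimetypes.guess_type('paper.pdf')
#   ('application/pdf', None)
#   >>> mimetypes.guess_type('notes.txt.gz')
#   ('text/plain', 'gzip')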
msg_part = email.MIMEBase.MIMEBase('application', 'octet-stream') msg_part.set_payload(fh.read()) # Encode the payload using Base64 email.Encoders.encode_base64(msg_part) elif maintype == 'image': msg_part = email.MIMEImage.MIMEImage(fh.read(), _subtype=subtype) elif maintype == 'audio': msg_part = email.MIMEAudio.MIMEAudio(fh.read(), _subtype=subtype) else: msg_part = email.MIMEBase.MIMEBase(maintype, subtype) msg_part.set_payload(fh.read()) # Encode the payload using Base64 email.Encoders.encode_base64(msg_part) # Set the filename parameter msg_part.add_header('Content-Disposition', 'attachment') _set_filename(msg_part, filename_unicode) return msg_part def _mimeify_address(address): if isinstance(address, str): return address else: (name, email_addr) = address return email.Utils.formataddr((_mimeify_unstructured(name), email_addr)) def _set_filename(msg_part, filename_unicode): # Filename parameter of structured header gets rfc2231 encoded: if _just_ascii(filename_unicode): filename = filename_unicode.encode('us-ascii') msg_part.set_param(param='filename', value=filename, header='Content-Disposition') else: charset = 'utf8' filename = filename_unicode.encode('utf8') msg_part.set_param(param='filename', value=filename, header='Content-Disposition', charset=charset) def _mimeify_unstructured(string): if not isinstance(string, unicode): # Unstructured fields get RFC2047 encoded. return string elif _just_ascii(string): return string.encode('us-ascii') else: return str(email.Header.make_header([(string.encode('utf8'), 'utf8')])) def _just_ascii(unicode_string): # Are are the objects in the unicode string ascii character?: return unicode_string.encode('utf8') == unicode_string.encode('us-ascii', 'ignore') # Error classes. class _EmailPackageError(Exception): """ Private error that will only be thrown for suspected programming errors in the Python email package. """ class EZEmailError(Exception): pass class EZEmailParseError(EZEmailError): """ An emtpy parent class for all public errors in this module. """ def __init__(self, msg): """ """ self.basic_email_info = _basic_email_info(msg) Exception.__init__(self) class EZEmailCreateError(Exception): pass class _EZEmailPrivateError(Exception): """ An emtpy parent class for all private errors in this module. """ pass class _UnicodeDecodingError(_EZEmailPrivateError): """ This is a private error which can be raised if attempting to use the unicode builtin fails because the charset we try to decode from isn't recognized. """ def __init__(self, value, charset): """ Constructor takes single argument; a string giving the name of the problem charset. """ self.value = value self.charset = charset class _StructuredHeaderPairError(_EZEmailPrivateError): """ This is a private error which will be raised if there is an error trying to parse and rejoin a key/value pair from a structured header. """ def __init__(self, key, value): self.key = key self.value = value class HeaderRFC2231Error(EZEmailParseError): """ This error is raised if we can't decode a structured header (eg. Content-Type or Content-Disposition) successfully. """ def __init__(self, msg, header, header_value, key, key_value): self.header = header self.header_value = header_value self.key = key self.key_value = key_value EZEmailParseError.__init__(self, msg) class HeaderCharsetError(EZEmailParseError): """ This error is raised if we can't recognize one of the charsets used in a particular header. 
""" def __init__(self, msg, header, header_value, problem_part, charset): """ Constructor takes an email.Message message object and header, value and charset (in their original rfc2047 encoding) as arguments and stores them. """ self.header = header self.header_value = header_value self.problem_part = problem_part self.charset = charset EZEmailParseError.__init__(self, msg) def __str__(self): return "header: %s\nheader value: %s\nproblem part: %s\ncharset: %s" % (self.header, self.header_value, self.problem_part, self.charset) class HeaderRFC2047Error(EZEmailParseError): """ This error is raised if we can't parse the RFC2047 encoding used in a particular header. """ def __init__(self, msg, header, value): """ Constructor takes an email.Message message object and header, value and charset (in their original rfc2047 encoding) as arguments and stores them. """ self.header = header self.value = value EZEmailParseError.__init__(self) def __str__(self): return "\nheader: %s\nvalue: %s\ninfo: %s" % (self.header, self.value, self.basic_email_info) class FromHeaderParsingError(EZEmailParseError): """ We have a From: header we can't parse. """ def __str__(self): return "\ninfo: %s" % (self.basic_email_info) class FromHeaderMissingError(EZEmailParseError): """ Somehow we have recieved a seriously broken email with no From: header. Reject! """ pass class _ParseDateError(_EZEmailPrivateError): """ Private error raised when email.Utils.parsedate or email.Utils.parsedate_tz fails to parse a date header value. """ pass class ParseDateError(EZEmailParseError): """ Public error raised when email.Utils.parsedate or email.Utils.parsedate_tz fails to parse a date header value. """ pass class _CTEDecodingError(_EZEmailPrivateError): pass class _MultipartCTEDecodingAttempt(_EZEmailPrivateError): """ Raised if an attempt is made to CTE decode a multipart message part. """ pass class MIMEPartError(EZEmailParseError): def __init__(self, msg, mhe, error_type): self.maintype = mhe.maintype self.subtype = mhe.subtype self.filename = mhe.filename if mhe.decoded_payload is None or mhe.msg_part.is_multipart(): # If we haven't decoded payload successfully, take sample # from CTE encoded payload: self.sample = mhe.msg_part.get_payload()[0:100] else: # Otherwise, take sample from CTE decoded payload: self.sample = mhe.decoded_payload[0:100] if error_type in self.valid_error_types: self.error_type = error_type else: raise ValueError('Programming Error: error_type = \'' + error_type + '\' is not valid for MIME parts') EZEmailParseError.__init__(self, msg) valid_error_types = ['cte_decoding', 'filename_decoding', 'downgrading_to_text', 'unicode_conversion', 'not_implemented'] def __str__(self): return "maintype: %s\nsubtype: %s\nfilename: %s\nsample: %s\nerror_type: %s" % (self.maintype, self.subtype, self.filename, self.sample, self.error_type) class EZEmailCreateError(EZEmailError): pass if __name__ == "__main__": import sys # import profile def f(): for filename in sys.stdin.xreadlines(): print filename, filename = filename[:-1] contents(filename) print "===" a = ParseMessage(open(filename, 'rb').read(), strict=False) print a.primary_message() f() # profile.run('f()') - + diff --git a/modules/elmsubmit/lib/elmsubmit_EZEmail.py.wml b/modules/elmsubmit/lib/elmsubmit_EZEmail.py.wml deleted file mode 100644 index 57c071566..000000000 --- a/modules/elmsubmit/lib/elmsubmit_EZEmail.py.wml +++ /dev/null @@ -1,1992 +0,0 @@ -# -*- coding: utf-8 -*- - -## $Id$ - -## This file is part of the CERN Document Server Software (CDSware). 
-## Copyright (C) 2002, 2003, 2004, 2005 CERN. -## -## The CDSware is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License as -## published by the Free Software Foundation; either version 2 of the -## License, or (at your option) any later version. -## -## The CDSware is distributed in the hope that it will be useful, but -## WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -## General Public License for more details. -## -## You should have received a copy of the GNU General Public License -## along with CDSware; if not, write to the Free Software Foundation, Inc., -## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - -""" -# Side note: CJK codecs at http://cjkpython.i18n.org/. - -Exports blah blah blah. - -Speed: - -Testing it on a random sample of 1500 messages culled from my INBOX, -it took an average of 5/100ths seconds to process each -message. (Running on a Linux P4 2Ghz machine). - -Shortcomings: - -- Does not support message/partial mime type. - - The message/partial mime type is designed to allow mailers to split - up the body of large messages into several 'message/partial' parts, - which can then be sent inside seperate email messages to the - intended receipient (see RFC2046 for the precise semantics). Upon - receipt the MUA is supposed to recombine the parts into the orignal - message. Supporting this fairly rare MIME type would add a lot of - complexity to the module; there would have to be a mechanism for - passing several email messages to the Message class constructor; further, - -- Simply deletes RFC2231 language specification extensions from - RFC2047 encoded header text. The rest of RFC2231 relating to header - parameter values is observed. Note that a language specfication is - different from a charset. This is an issue in very few - circumstances. Hopefully it won't be an issue at all eventually, - because soon charsets should allow language specifications to be - built into them, and whatever happens in the future to unicode (and - accordingly python's unicode object and function) will adapt to - this. Quoting from RFC2231: - - > 8. Character sets which allow specification of language - > - > In the future it is likely that some character sets will provide - > facilities for inline language labeling. Such facilities are - > inherently more flexible than those defined here as they allow for - > language switching in the middle of a string. - > - > If and when such facilities are developed they SHOULD be used in - > preference to the language labeling facilities specified here. Note - > that all the mechanisms defined here allow for the omission of - > language labels so as to be able to accommodate this possible future - > usage. 
- -""" - -# Quicker to do "from email import *" but this gives us a reminder of -# what's available: - -import email - -import email.Message -import email.Parser -import email.Generator -import email.Header -import email.Charset -import email.Encoders - -import email.MIMENonMultipart -import email.MIMEMultipart -import email.MIMEMessage -import email.MIMEText -import email.Utils - -import email.Errors - -import mimetypes - -# Non email imports: - -import time -import datetime -import os - -import StringIO -import quopri -import uu -import base64 - -import re - -import rtf.Rtf2Txt -from rtf.RtfParser import RtfException as _RtfException - -import cdsware.elmsubmit_richtext2txt as _richtext2txt -import cdsware.elmsubmit_enriched2txt as _enriched2txt -import cdsware.elmsubmit_html2txt as _html2txt - -from cdsware.elmsubmit_misc import concat as _concat -from cdsware.elmsubmit_misc import cr2lf as _cr2lf -from cdsware.elmsubmit_misc import random_alphanum_string as _random_alphanum_string -from cdsware.elmsubmit_misc import wrap_text as _wrap_text - -from cdsware.elmsubmit_filename_generator import generate_filename as _generate_filename - -import cdsware.elmsubmit_EZArchive as elmsubmit_EZArchive - -# Message Parsing: - -# (Search down to "# Message Creation:" also.) - -_default_handling_hints = { 'generate_filename' : 'always', - 'descend_message_rfc822_attachments' : True, - 'archive_format' : 'tar.gz', - 'archive_multipart_unrecognized': True, - 'archive_multipart_parallel': True, - 'archive_multipart_related': True, - 'generate_filename' : 'if_missing' } - -class ParseMessage(object): - - """ - This class provides a very simple representation of an email - message. It is basically a wrapper around the class ParseMessage - provided by the Email package that comes with Python. - - It is designed to give simple access to the body text, attachments - etc. without the programmer having to consider the evil - complexities of MIME. It READ ONLY. ie. Don't expect to produce a - new email by mutating the data returned by the class methods! - - Instance properties: - - self.headers - - Returns the headers of the message as a Python dictionary - object. - - The data structure _headers might look as follows: - - _headers = { 'to' : [u"alice@example.com, bob@example.org"], - 'recieved' : [u"from cernfe02.cern.ch ([137.138 etc...", - u"from client.cern.ch ([137.138. etc..."], - 'date' : [u"Wed, 4 Aug 2004 15:07:17 +0200 (W. Europe Daylight Time)"] } - - ie. It is a python dictionary: the keys are headers and the - values are lists of header values (note that an email - (RFC822) message may contain duplicate headers). Each list - contains the header values in the relative order they appear - in the original message. rfc822 headers are case-insensitive - so we store them in lowercase. - - Header processing decodes each header from its RFC2047 encoded - format then further decodes it from the native charsets into - unicode. Hence the list values in the data structure are - unicode strings. Note that the _keys_ are just regular ascii - strings, since we should be able to rely on header keys being - 7-bit ascii only. - - self.received_data - self.primary_message - - Returns the best guess at what the intended message is. - - self.original_message - - Returns the orignal message as supplied to the constructor. 
- - self.attachments - self.inline_attachments - self.from_header - self.subject - self.from_name - self.from_email - self.message_id - self.date_sent_utc - - Returns an ISO8601 formatted unicode string of the date the - email was sent (in UST/GMT). ISO8601 strings look like this: - "YYYY-MM-DD HH:MM:SS". If the Date: header is not present or - unparsable, the current time on the system (in GMT) is - substituted instead. - """ - - def __init__(self, message_string, strict=True, hints=_default_handling_hints): - - # message_string is a string representing an email (aka rfc822 - # message). strict determines +++++++++++++ - - # Save the original message string (can be accessed later by - # self.origMessage method) - - self.original_message = message_string - - # Create an email.Message object from the plain text. - - msg = email.message_from_string(message_string) - - # Now populate self.headers; intended to be accessed later by - # self.headers method. The data structure is described in the - # .headers method docstring. - - self.headers = _process_headers(msg, msg.items(), force_processing=(not strict)) - - # Now we move on to calculating _from_name, _from_email - - # Of course, there might not be a From: header in a very - # broken email, so we raise a FromHeaderError if this is the - # case. - - try: - # KeyError means email is missing 'from' field: - from_header = self.headers['from'][0] # If mutliple From: fields, use 1st. - - self.from_addr = from_header - - # from_header could be None if we are operating with - # strict=False and we failed to decode the header: - if from_header is None: raise FromHeaderParsingError(msg) - - (from_name, from_email) = _parse_from_header(msg, from_header) - except KeyError: - if strict: - raise FromHeaderMissingError(msg) - else: - from_name = None - from_email = None - except FromHeaderParsingError: - if strict: - raise # Reraise the error. - else: - from_name = None - from_email = None - - self.from_name = from_name - self.from_email = from_email - - try: - self.subject = self.headers['subject'][0] - except KeyError: - self.subject = '' # Should we put None here? - - try: - self.message_id = self.headers['message-id'][0] - except KeyError: - self.message_id = '' # Should we put None here? - - # Process the received headers, extracting the 'received from' host and ip address: - - try: - self.received_data = map(_received_ip_and_host, self.headers['received']) - except KeyError: - # There were no recieved headers; should this be an error - # in strict mode or not? - # I think not, since people can save email locally without sending it. - self.received_data = None - - # Now calculate _date_sent_utc. Test to see if there actually - # is a date header and if we can parse it. If running in - # strict mode, then we throw an error. If not, then simply use - # the localtime. - - try: - date_in_rfc2822_format = self.headers['date'][0] - remote_struct_time_with_utc_offset = email.Utils.parsedate_tz(date_in_rfc2822_format) - - # email.Utils.parsedate_tz returns None on failure. - if remote_struct_time_with_utc_offset is None: raise _ParseDateError() - - remote_struct_time = remote_struct_time_with_utc_offset[0:9] - (remote_offset_from_utc_in_seconds,) = remote_struct_time_with_utc_offset[9:10] - - if remote_offset_from_utc_in_seconds is None: raise _ParseDateError() - except (KeyError, _ParseDateError): - - if strict: raise ParseDataError(msg) - - else: - # Use local time on error. 
- remote_struct_time = time.gmtime() - remote_offset_from_utc_in_seconds = 0 - - date_time_args = remote_struct_time[0:6] # datetime constructor only needs first 6 parts of struct_time tuple - -# filter(lambda x: x is None: date_time_args) -# if filter != []: raise ParseDateError(msg) - - remote_time = datetime.datetime(*date_time_args) - remote_utc_delta = datetime.timedelta(seconds=remote_offset_from_utc_in_seconds) - -# local_utc_delta = datetime.timedelta(seconds= -time.timezone) - - utc_time = remote_time - remote_utc_delta -# local_time = utc_time + local_utc_delta - - # Now that we have the date sent in utc, we just format it to - # an ISO8601 string and convert that to a unicode - # object. Since the ISO8601 string will only contain us-ascii, - # this conversion should not fail and so we need not check for - # decoding errors. - - self.date_sent_utc = unicode(utc_time.isoformat(sep=' '), 'us-ascii') - -# self._date_sent_local = local_time.isoformat(sep=' ') - - # Now we parse the email and attempt to calculate what the - # primary message (ie. what would pop-up in the message pane - # of your email client) would be. - - (self.attachments, - self.inline_attachments, - self.primary_message) = _get_msg_structure(msg, strict, hints) - - -def contents(filename): - - f = file(filename, "r") - - p = email.Parser.Parser() - msg = p.parse(f) - - def walk_email(msg,indent=0): - - print "-"*indent, msg.get_content_type() - - if msg.is_multipart(): - for part in msg.get_payload(): - walk_email(part,indent+8) - - walk_email(msg) - - f.close() - -####### __main__ - -# f = open('blower.eml','r') -# e = f.read() - -# f = open('testSpliter/torture-test.eml','r') -# tort = f.read() - - -# f = open('just_text.eml','r') -# e2 = f.read() - -# f = open('hello.eml','r') -# e3 = f.read() - -# f = open('rtf2.eml') -# e4 = f.read() - -# f = open('attached_msg.eml') -# e5 = f.read() - -# f = open('/tmp/eg/example.jpg','r') -# jpg = f.read() - -# f = open('/tmp/eg/example.msword.doc','r') -# word = f.read() - -# f = open('/tmp/eg/example.xls','r') -# excel = f.read() - -# f = open('/tmp/eg/example.pdf','r') -# pdf = f.read() - -# f = open('/tmp/eg/example.reg.gz','r') -# reg = f.read() - -# f = open('/tmp/eg/example.wk3','r') -# lotus = f.read() - -# f = open('/tmp/eg/example.xml','r') -# xml = f.read() - -# f = open('/tmp/eg/example.tar.gz','r') -# targz = f.read() - -# f = open('/tmp/eg/example.tar','r') -# tar = f.read() - -# f = open('/tmp/eg/example.zip','r') -# zip_data = f.read() - -# f = open('/tmp/eg/example.xml.bz2','r') -# bz2eg = f.read() - - -# Support functions. - -def _received_ip_and_host(received_header): - - host_re = re.compile(r"""from\ ( # from marks the start of the received target - [a-z0-9]+(?:[a-z0-9_.-]+[a-z0-9])? # Match a domain string. - ) # Allow illegal but common underscores - # (eg. the famous dear_raed.blogspot.com) - [)\s] # Terminate with space or a closing bracket depending on the format. 
- """,re.VERBOSE|re.IGNORECASE) - - ipad_re = re.compile(r"""[[(] # match opening bracket or parenthesis - # (should be a bracket if following standards) - ((?:\d{1,3}\.){3} # match three octets with dots - \d{1,3}) # match a single octet with no dot - [])] # match the closing bracket/parenthesis - """, re.VERBOSE|re.IGNORECASE) - - host_match = host_re.search(received_header) - if host_match is not None: - host = host_match.group(1) - else: - host = None - - ipad_match = ipad_re.search(received_header) - if ipad_match is not None: - ipad = ipad_match.group(1) - else: - ipad = None - - return (host, ipad) - -def _basic_email_info(msg): - - """ - Takes an email.Message object and returns a dictionary, formatted - like the following example, containing a basic subset of - information about the message: - - { from: u'Ann Other ' - from_email : u'person@example.org', - from_name : u'Ann Other', - subject : u'This email is about...', - message-id : u'1234567890@host.example.com', } - - Any header which cannot be decoded to unicode will be returned in - it original encoded form. Check with type(value) = unicode. - - This function can be used when throwing an error to gather just - enough information about the message so clients of the - elmsubmit_EZEmail.ParseMessage class can respond to the email author - reporting the error. - """ - - # Items we want to try and return: - # If you wish to tailor this list, note that basic_headers MUST - # have 'from' in it, otherwise the following code breaks! - - basic_headers = ['from', 'subject', 'message-id'] # 'from_name' and 'from_email' aren't headers; - # they are derivatives of the 'from' header. - - # The hash to be built up and returned: - - return_dict = {} - - # Get all header/value pairs for which the header is also in list - # basic_headers (case insensitively): - - f = lambda (k,v): k.lower() in basic_headers - basic_items = filter(f, msg.items()) - - # Now attempt to decode the basic headers to unicode objects: - - basic_decoded_headers = _process_headers(msg=None, header_value_pairs=basic_items, force_processing=True) - - # Since we're just using this for error output, we don't need to - # worry about headers with the same header key; just accept the - # first one present (and note that the list of headers in - # basic_headers are all ones which _should_ only be appearing - # once). - - g = lambda (k,v): (k,v[0]) - basic_decoded_headers = dict(map(g, basic_decoded_headers.items())) - - try: - - # If the from header is missing this access will cause - # KeyError: - - from_value = basic_decoded_headers['from'] - - # If from_header is None, we couldn't decode it and so can't - # proceed in splitting it into from_name and from_email. Raise - # TypeError. - - if from_value is None: raise TypeError - - # Could cause FromHeaderParsingError: - (from_name, from_email) = _parse_from_header(msg, from_value) - - return_dict.update({ 'from_name': from_name, - 'from_email': from_email }) - - except (TypeError, KeyError, FromHeaderParsingError): - return_dict.update({ 'from_name': None, - 'from_email': None }) - - # This loops over basic_headers and tries to index - # basic_decoded_headers by each value. Anything that isn't present - # (ie. we've failed to decode), we look up directly in the orginal - # msg object (and return the value as a string in whatever charset - # and RFC2047 encoding it arrived in) if _that_ fails (ie. the - # header is missing from the message altogether) we set the value - # in the hash to None. 
- - for header in basic_headers: - value = basic_decoded_headers.get(header, None) - - if value == None: - value = msg.get(header, None) - - return_dict[header] = value - - return return_dict - -def _native2unicode(value_nc, native_charset=None, strict=True): - - """ - Function native2unicode is a wrapper around builtin function - unicode. The difference is that native2unicode will accept a - charset of None which will cause it to default to decoding from - us-ascii. - - It also raises a custom error _UnicodeDecodingError which returns - the problem value and charset, rather than LookupError/ValueError - raised by the unicode builtin. - """ - errors = { True : 'strict', - False: 'replace'}[strict] - - # Non-RFC2047 encoded parts return charset as None; we assume then - # that they are us-ascii bytes. - - if native_charset is None: native_charset = 'us-ascii' - - # Remove RFC2123 language specification from document if present. - # This is delimited from the charset by the '*' character. eg. We - # might have - # native_charset = 'us-ascii*en' - # and we need to remove '*en'. - - # This is the key reason we have function _native2unicode, and - # aren't just calling .decode(charset)! - - native_charset = re.sub(r'\*.*$', '', native_charset) - - # Search this document for RFC2123 for more information. - - # unicode function might not recognize the native_charset and - # hence throw a LookupError. Or it might fail to do the conversion - # and throw a UnicodeError - - try: - return unicode(value_nc, native_charset, errors) - except (LookupError, UnicodeError): - raise _UnicodeDecodingError(value_nc, native_charset) - -def _process_headers(msg, header_value_pairs, force_processing=False): - - f = lambda headers, (header,value): _process_header(msg, headers, header, value, force_processing) - return reduce(f, header_value_pairs, {}) - -def _decode_rfc2231_tuple(msg, value): - - try: - try: - (charset, lang_specification, encoded_value) = value - - # If charset is unspecified, then we can assume us-ascii: - if charset == '': charset = 'us-ascii' - return _native2unicode(encoded_value, charset, strict=True) - except ValueError: - - # Data was not RFC2231 encoded. ie. value should be just an - # ascii-string. - - # Note however that some broken email clients mistakenly use - # RFC2047 encoding in parameterized header fields such as - # Content-Type and Content-Disposition. This is disallowed by - # RFC2047: - - # > + An 'encoded-word' MUST NOT be used in parameter of a - # > MIME Content-Type or Content-Disposition field, or in any - # > structured field body except within a 'comment' or - # > 'phrase'. - - # In order to support these clients, if we get a string back - # which wasn't RFC2231 encoded, we check instead to see if it - # can be RFC2047 decoded. - - # Note that header='rfc2231_param' is just a dummy value; - # we're not really decoding an rfc2047 encoded header; - # just trying to support clients that mistakenly use - # rfc2047 encoding for parameters _within_ structured - # headers. 
- - return _decode_rfc2047_header(msg, header='rfc2231_param', value=value) - except (_UnicodeDecodingError, HeaderCharsetError, HeaderRFC2047Error): - return None - -def _decode_and_join_structured_header_pair(msg, key, value): - - # Take input that looks like this: - # - # key = 'title' - # value = ('us-ascii', 'en', "This is even more ***fun*** isn't it!") - # - # And return it looking like this: - # - # u'title="This is even more ***fun*** isn't it!"' - - # The key should always be just a us-ascii string: - - try: - decoded_key = _native2unicode(key, 'us-ascii') - except _UnicodeDecodingError: - raise _StructuredHeaderPairError(key, value) - - if value == '': - # We have a structured entry that is not in key=value form. eg. The multipart/mixed in - # 'Content-Type: multipart/mixed; boundary="------------050902070901080909090201"' - - return decoded_key - - else: - decoded_value = _decode_rfc2231_tuple(msg, value) - if decoded_value is None: raise _StructuredHeaderPairError(key, value) - - # Now escape string for addition of quotes either side: - # Escape backslashes: - decoded_value = re.sub(r'\\', r'\\\\', decoded_value) - # Escape quotes: - decoded_value = re.sub(r'"', r'\\"', decoded_value) - - return decoded_key + '="' + decoded_value + '"' - -def _decode_rfc2231_header(msg, header, value, force_processing=False): - - # We get the key/value pairs from the structured header by calling - # the email.ParseMessage class's get_params method. This method deals - # with all of the RFC2231 decoding for us, and so we just have to - # reconstruct the tuples it gives us into a unicode string. - - # This means these headers are no longer suitable for parsing by - # machine, but does make them suitable for display (which is - # prefered from two mutually incompatable options; if you want to - # start parsing Content-Type parameters, then you want to be using - # the Python email package directly!). - -# Take a value that looks like: -# value = -# And turn it into a unicode string that looks like: -# u'"This is even more ***fun*** isn't it!"' - -# The values in the tuple are from left to right are a charset, -# language specification and string of encoded text. - -# We ignore the language specification. See list of module -# shortcomings. -# """ - - params = msg.get_params(None, header) - - # param should never return failobj None since we have already - # verified the header we are requesting exists. - if params is None: raise _EmailPackageError - - try: - - f = lambda (k,v): _decode_and_join_structured_header_pair(msg, k, v) - joined_pairs = map(f, params) - unicode_value = '; '.join(joined_pairs) - - except _StructuredHeaderPairError, e: - if force_processing: - unicode_value = None - else: - raise HeaderRFC2231Error(msg, header, value, e.key, e.value) - - return unicode_value - -def _decode_rfc2047_header(msg, header, value, force_processing=False): - """ - Take an rfc2047 encoded string and convert it to unicode. - """ - - # For each header value two decoding steps happen: - # 1. We decode the header from its RFC2047 encoded format. - # 2. We decode the resulting information from its native - # charset to a unicode object. - - # decode_header takes the RFC2047 encoded form and returns a list - # of tuples (string_in_native_charset, charset_name). It is a - # *list* not a single tuple, since it is permissible to use - # multiple charsets in a single header value! 
- - # Although undocumented in the python library documentation, - # looking at the email.Header source suggests decode_header might - # raise an 'email.Errors.HeaderParseError'. We catch this and - # raise our own 'HeaderRFC2047Error'. - - try: - decoded_parts = email.Header.decode_header(value) - - # The _native2unicode function might not recognise one of the - # charsets and so throw a private _UnicodeDecodingError. If we - # get one, then we catch it and raise public error - # "HeaderCharsetError". - - f = lambda (value, charset): _native2unicode(value, charset, not force_processing) - unicode_decoded_parts = map(f, decoded_parts) - - # Since all members of decoded_parts are now in unicode we can - # concatenate them into a single header value string. - - unicode_value = u''.join(unicode_decoded_parts) - - except email.Errors.HeaderParseError: - if force_processing: - unicode_value = None - else: - raise HeaderRFC2047Error(msg, header, value) - except _UnicodeDecodingError, e: - if force_processing: - unicode_value = None - else: - raise HeaderCharsetError(msg, header, value, e.value, e.charset) - - return unicode_value - -def _process_header(msg, headers, header, value, force_processing=False): - - # Function _process_header takes a partial headers dictionary - # and a header/value pair and updates the dictionary with this - # pair. - - # Headers are decoded from their RFC2231 encoding and turned into - # unicode strings. - - # For Content-Type and Content-Disposition headers only, an - # alternative decoding step happens; we attempt to decode these - # structured headers from their RFC2231 encoding and rebuild them - # as unicode strings. - - if header.lower() in ('content-type', 'content-disposition'): - unicode_value = _decode_rfc2231_header(msg, header, value, force_processing) - else: - unicode_value = _decode_rfc2047_header(msg, header, value, force_processing) - - # Repeated header keys are legal, so we store dictionary - # values as a list. Therefore we must check if this header - # key has already been initialized in the dictionary. - - header = header.lower() - headers.setdefault(header, []) # If key header isn't present, add it with value [] - headers[header].append(unicode_value) - - return headers - -def _parse_from_header(msg, from_header): - - ### !!! Need to do some thinking about internationalized email - ### !!! addresses and domain names to check what problems - ### !!! these may cause. - - (from_name, from_email) = email.Utils.parseaddr(from_header) - - # Check we were able to parse the From: field - # (email.Utils.parseaddr returns ('','') on failure) and that - # from_email is not empty. Otherwise raise a FromHeaderParsingError - - # empty from_name is OK, since we just use from_email as the - # author's 'name'. 
- - if (from_name, from_email) == ('','') or from_email == '': - raise FromHeaderParsingError(msg) - elif from_name == '': - from_name = from_email - - return (from_name, from_email) - -def _get_msg_structure(msg, strict, hints): - - mime_helper = _MimeHelper(msg) -# mime_helper.maintype = 'multipart' -# mime_helper.subtype = 'parallel' - mime_handler = _get_mime_handler(mime_helper.maintype, mime_helper.subtype) - - nominal_attachments = mime_handler(msg, mime_helper, strict, hints) - attachments = [] - inline_attachments = [] - primary_msg = '' - - for item in nominal_attachments: - - if item['disposition'] != 'attachment': - if item['maintype'] == 'text' and item['downgrading_to_text'] is None: - primary_msg += _force_ends_in_newline(item['file']) - inline_attachments.append(item) - elif item['downgrading_to_text'] is not None: - primary_msg += _force_ends_in_newline(item['downgrading_to_text']) - inline_attachments.append(item) - else: - attachments.append(item) - else: - attachments.append(item) - - return (attachments, inline_attachments, primary_msg) - -def _force_ends_in_newline(string): - - if string == '' or string[-1] != '\n': - return string + '\n' - else: - return string - -def _get_mime_handler(maintype, subtype): - - try: - handler = _mime_handler_map[maintype][subtype] - except KeyError: - try: - handler = _mime_handler_map_unrecognized_subtype[maintype] - except KeyError: - handler = _mime_handler_unrecognized_maintype - - # Create a 'wrapper' function which does preparatory checks we - # want to happen for all mime type before executing the real mime - # handler (possibly we could have used a class based approach to - # allow for more levels of wrapping, but I think this may have - # been a sledgehammer on nut): - - def parent_handler(msg, mhe, strict, hints): - - if mhe.decoded_payload is None: # and not mhe.msg_part.is_multipart(): - if strict: - raise MIMEPartError(msg, mhe, 'cte_decoding') - else: - return [] - - if mhe.filename == ['FilenameDecodingError']: - if strict: - raise MIMEPartError(msg, mhe, 'filename_decoding') - else: - mhe.filename = None - - if maintype == 'text': - - # Make sure text data has unix newline conventions. - mhe.decoded_payload = _cr2lf(mhe.decoded_payload) - - try: - mhe.file = _native2unicode(mhe.decoded_payload, mhe.charset, strict) - except _UnicodeDecodingError: - if strict: - raise MIMEPartError(msg, mhe, 'unicode_conversion') - else: - return [] - return handler(msg, mhe, strict, hints) - - return parent_handler - -# Generate filename values: never, always, if_missing - -def _format_msg_part_data(mhe, hints): - - part_info = {} - part_info['file'] = mhe.file #'UNCOMMENT THIS TO SEE FILE!!!!!' 
#mhe.file - part_info['downgrading_to_text'] = mhe.downgrading_to_text - part_info['maintype'] = mhe.maintype - part_info['subtype'] = mhe.subtype - part_info['filename'] = mhe.filename - part_info['disposition'] = mhe.disposition - if mhe.maintype == 'text': - part_info['original_charset'] = mhe.charset - part_info['signature'] = mhe.signature - part_info['encrypted'] = mhe.encrypted - part_info['mac_resource_fork'] = mhe.mac_resource_fork - part_info['rejected_alternatives'] = mhe.rejected_alternatives - - # Now see if we need to generate a filename: - gf = hints['generate_filename'] - - if gf == 'always' or (gf == 'if_missing' and (mhe.filename is None or mhe.filename == '')): - generated_filename = _generate_filename(file=mhe.decoded_payload, content_type=(mhe.maintype + '/' + mhe.subtype)) - else: - generated_filename = None - - part_info['generated_filename'] = generated_filename - return part_info - -def _get_part_disposition(msg_part): - """ - Look to see whether this part is designated as inline, attachment - or something else. - """ - - # BNF of Content-Disposition header quoted from RFC2183: - - # disposition := "Content-Disposition" ":" - # disposition-type - # *(";" disposition-parm) - # - # disposition-type := "inline" - # / "attachment" - # / extension-token - # ; values are not case-sensitive - - # The BNF states only "inline" or "attachment" are valid tokens - # for disposition-type, so there is now need to worry about - # RFC2231 encoded data. (And any extension tokens should be - # similarly restricted to simple ascii). - - # This dictates that the disposition-type must be the first element in - # the header. get_params returns something like this: - - # >>> msg_part.get_params(None, 'Content-Disposition') - # [('inline', ''), ('filename', 'email.txt')] - - # So we have to index by [0][0] to get the disposition-type keyword. - - try: - return msg_part.get_params(None, 'Content-Disposition')[0][0] - except (TypeError, IndexError): - return None - -def _get_part_filename(msg_part): - - """ - Attempt to discover a filename associated with a message body - part. - - Note that the filename, if it exists, is returned as a unicode - string. Filenames may not be as simple as you expect; what you get - may be a string of Arabic characters. - """ - - # Note, we could just use the email.ParseMessage method get_filename to - # try and discover a filename. However, this only checks the - # Content-Disposition header for the filename parameter whereas we - # would like to support crufty old clients which are still using - # the Content-Type name parameter. - - missing = [] - - #First try content-disposition: - filename = msg_part.get_param('filename', missing, 'content-disposition') - if filename != missing: - - filename = _decode_rfc2231_tuple(msg_part, filename) - if filename is None: - return ['FilenameDecodingError'] - else: - return filename - else: - - # If filename parameter of content-disposition is not - # available, try name parameter of content-type: - filename = msg_part.get_param('name', missing, 'content-type') - - if filename != missing: - filename = _decode_rfc2231_tuple(msg_part, filename) - if filename is None: - return ['FilenameDecodingError'] - else: - # No filename available: - return None - -def _cte_decode(msg_part): - - """ - Return a message part's payload, decoded from its content - transfer encoding. - """ - - # Note that it is possible to use - # msg_part.get_payload(decode=True) to do the CTE - # decoding. 
Unfortunately, the error reporting of this method is - # not very helpful; if CTE decoding fails, it just returns the - # undecoded payload. This makes it hard to tell if there has been - # success or not. Initially I thought decoding failure could be - # identified by checking if: - - # msg_part.get_payload(decode=True) == msg_part.get_payload(decode=False) - - # But this method would flag false errors. For example, if text - # contains no 'nasty characters' it will be the same both before - # and after quoted-printable encoding (see the quoted-printable - # RFC for a definition of 'nasty characters!); ie. for such text - # quoted-printable encoding is an identity map. - - # The following function is essentially a cut-and-paste of the - # Email.Message.Message class's get_payload method, edited to - # raise _CTEDecodingError upon decoding failure. - - # In the case we are passed a multipart message part, we cannot - # decode it and so throw _MultipartCTEDecodingAttempt: - if msg_part.is_multipart(): raise _MultipartCTEDecodingAttempt - - payload = msg_part.get_payload(decode=False) - cte = msg_part.get('content-transfer-encoding', '').lower() - - try: - - if cte == 'quoted-printable': - # Could cause binascii.Error/Incomplete - payload = quopri.decodestring(payload) - elif cte == 'base64': - # Could cause - payload = _base64decode(payload) - elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'): - sfp = StringIO() - - uu.decode(StringIO(payload+'\n'), sfp) - payload = sfp.getvalue() - - except (binascii.Error, binascii.Incomplete, uu.Error): - raise _CTEDecodingError - - return payload - -def _base64decode(s): - """ - Decode bas64 encoded string. - """ - - # This is a cut and paste of email.Utils._bdecode. We can't call - # _bdecode directly, because its a private function of the Utils - # modules and therefore not safe to import. - - # We can't quite use base64.encodestring() since it tacks on a "courtesy - # newline". Blech! - if not s: - return s - value = base64.decodestring(s) - if not s.endswith('\n') and value.endswith('\n'): - return value[:-1] - return value - -# MIME Handlers: - -def _get_flattened_payload(msg_part, with_mime_headers=False): - - flattened_data = msg_part.as_string(unixfrom=False) - - # if with_mime_headers is False, then remove them: - if not with_mime_headers: - # Regex should remove from the start of the string up to the - # first double newline '\n\n', with possibly space in-between - # the newlines. This should chop of the mime_headers. - - # (?s) in the regex sets the DOTALL flag; ie. '.' matches everything including newline. - flattened_data = re.sub(r'(?s)(.*?\n\n)', r'', flattened_data, count=1) - - return flattened_data - -def _email_structure_to_directory_structure(email_structure, directory_base=''): - - files = [] - used_random = {} - alt_part_number = 0 - - for item in email_structure: - try: - # Is it a list? (ie. a list of alternative parts): - item.append # Does this throw AttributeError? If not, it is list like. - alt_part_number += 1 - files.extend(_email_structure_to_directory_structure(email_structure=item, - directory_base=os.path.join(directory_base, - ('alternative_data_' + str(alt_part_number))))) - except AttributeError: - # Or a dictionary? (ie. 
an actual part): - possible_filenames = [item['filename'], - item['generated_filename'], - 'unamed_part_' + _get_unused_random(lambda: _random_alphanum_string(length=8), used_random)] - available_filenames = filter(lambda x: x is not None, possible_filenames) - filename = available_filenames[0] - - if item['file'] is not None: - files.append((os.path.join(directory_base, filename), item['file'])) - else: - files.append((filename, '')) - - if item['downgrading_to_text'] is not None: - files.append((os.path.join(directory_base, filename + '.txt'), item['downgrading_to_text'])) - - return files - -def _archive_part(msg, mhe, strict, hints): - - processed_parts = _get_mime_handler('multipart', 'mixed')(msg, mhe, strict, hints) - files = _email_structure_to_directory_structure(processed_parts) - - mhe.file = elmsubmit_EZArchive.create(files, input_disposition='named_byte_strings', - compress_to='byte_string', - compression=hints['archive_format'], - force_file_permissions=0664, - force_dir_permissions=0775) - - mhe.filename = '_'.join([_random_alphanum_string(length=8), mhe.maintype, mhe.subtype, - 'archive.' + hints['archive_format']]) - -def _pick_from_alternatives(processed_parts): - - processed_parts.sort(multipart_alternative_sort) - return (processed_parts[0], processed_parts[1:]) - -def multipart_alternative_sort(part1, part2): - - # We deal with multipart/alternative by prefering in descending - # order: - # text/plain, score 6 - # text/html, score 5 - # text/enriched, score 4 - # text/richtext, score 3 - # text/rtf, score 2 - # application/rtf, score 1 - - # (the later five we make use of their 'downgrading_to_text') - - # Although text/richtext is a simpler format than text/html, it - # has theoretically been obsoleted by text/enriched and so comes - # lower in order of preference. - - # A note on the rich text mess: "Why are their four types of rich - # text? Surely they're all the same thing?" Unfortunately not... - - # - RFC1341 defines a simple text markup for mime type - # 'text/richtext'. - - # - RFC1896 (and some RFCS before it which 1896 obsoletes) defines - # 'text/enriched' which is designed to solve the shortcomings of - # 'text/richtext'; use of 'text/richtext' is deprecated in - # favour of 'text/enriched' - - # - 'text/rtf' and 'application/rtf' refer to Microsoft's RTF file - # format, and are not specified in any RFC (that I know of). They - # are the same file format; it's just that the registration got - # duplicated (people weren't sure whether to describe rtf as a - # plaintext format; ie. readable when unparsed by humans, or - # application (ie. needs to be parsed to make any sense of)! - - # Some useful reading: - - # http://mango.human.cornell.edu/kens/etf.html (text/enriched primer) - # http://www.faqs.org/rfcs/rfc1896.html (text/enriched RFC) - # http://www.faqs.org/rfcs/rfc1341.html (text/richtext RFC) - # News message ID: <199306081944.AA13622@mudhoney.micro.umn.edu> - # (the thread this message sits in contains the - # registrations of text/rtf and application/rtf) - - liked_formats = [ 'text/plain', - 'text/html', - 'text/enriched', - 'text/richtext', - 'text/rtf', - 'application/rtf' ] - - scorecard = dict(zip(liked_formats, range(len(liked_formats), 0, -1))) - - # Create something that looks like this: - - # {'application/rtf': 1, - # 'text/enriched': 4, - # 'text/html': 5, - # 'text/plain': 6, - # 'text/richtext': 3, - # 'text/rtf': 2} - - # Doing the calculation instead of hardcoding allows liked_formats - # to be rearranged more easily! 
- - # Part types not in this list get score 0. - - score1 = scorecard.get(part1['maintype'] + '/' + part1['subtype'], 0) - score2 = scorecard.get(part2['maintype'] + '/' + part2['subtype'], 0) - # We want the list in reverse order, big down to small: - return cmp(score2, score1) - -def _get_unused_random(rand_function, used_random): - - r = rand_function() - - while used_random.has_key(r): - r = rand_function() - - used_random[r] = True - return r - -class _MimeHelper(object): - - def __init__(self, msg_part): - - self.msg_part = msg_part - self.maintype = msg_part.get_content_maintype() - self.subtype = msg_part.get_content_subtype() - if self.maintype == 'text': - self.charset = msg_part.get_content_charset('us-ascii') - else: - self.charset = None - self.disposition = _get_part_disposition(msg_part) - self.filename = _get_part_filename(msg_part) - self.signed = False - self.signature = None - self.encrypted = False - self.mac_resource_fork = None - self.downgrading_to_text = None - self.rejected_alternatives = None - - if msg_part.is_multipart(): - # If multipart, get the flattened payload. - self.decoded_payload = _get_flattened_payload(msg_part, with_mime_headers=False) - else: - # If its not multipart, attempt CTE decoding and store - # result: - try: - self.decoded_payload = _cte_decode(msg_part) - except (_CTEDecodingError, _MultipartCTEDecodingAttempt): - self.decoded_payload = None - -def _mime_handler_application_applefile(msg, mhe, strict, hints): - return [{mhe.maintype : 'not implemented yet'}] - -def _mime_handler_application_octetstream(msg, mhe, strict, hints): - - # application/octet-stream requires no special handling. All of - # the necessary work has been done in the parent handler. - - mhe.file = mhe.decoded_payload - return [ _format_msg_part_data(mhe, hints) ] - -def _mime_handler_application_pgpencrypted(msg, mhe, strict, hints): - return [{mhe.maintype : 'not implemented yet', mhe.subtype : 'problems!'}] -def _mime_handler_application_pgpkeys(msg, mhe, strict, hints): - return [{mhe.maintype : 'not implemented yet', mhe.subtype : 'problems!'}] -def _mime_handler_application_pgpsignature(msg, mhe, strict, hints): - return [{mhe.maintype : 'not implemented yet', mhe.subtype : 'problems!'}] -def _mime_handler_application_rtf(msg, mhe, strict, hints): - - # application/rtf is same as text/rtf, so call _get_mime_handler - # to retrieve correct handler. - - # Note that we can't just execute _mime_handler_text_rtf directly, - # because this would miss the neccessary parent_handler code, - # which depends on knowing if maintype is 'text' (which the - # misregistered application/rtf would hide): - return _get_mime_handler('text', 'rtf')(msg, mhe, strict, hints) - -def _mime_handler_message_externalbody(msg, mhe, strict, hints): - return [{mhe.maintype : 'not implemented yet', mhe.subtype : 'problems!'}] - -# def _mime_handler_message_news(msg, mhe, strict, hints): -# pass -# Currently just treat as application/octet-stream. - -def _mime_handler_message_partial(msg, mhe, strict, hints): - - if strict: - raise MIMEPartError(msg, mhe, 'not_implemented') - else: - return [] - -def _mime_handler_message_rfc822(msg, mhe, strict, hints): - - if not hints['descend_message_rfc822_attachments']: - # Treat as a binary attachment. - return _get_mime_handler('application', 'octet-stream')(msg, mhe, strict, hints) - else: - # Descend into the message as if it were a multipart/mixed - # type. 
- return _get_mime_handler('multipart', 'mixed')(msg, mhe, strict, hints) - -def _mime_handler_multipart_alternative(msg, mhe, strict, hints): - - # We handle multipart alternative just like multipart mixed, but - # then pick our prefered alternative, storing the remaining - # alternatives in mhe.rejected_alternatives. - - (prefered, rejects) = _pick_from_alternatives(_get_mime_handler('multipart', 'mixed')(msg, mhe, strict, hints)) - prefered['rejected_alternatives'] = rejects - return [ prefered ] - -def _mime_handler_multipart_appledouble(msg, mhe, strict, hints): - return [{mhe.maintype : 'not implemented yet', mhe.subtype : 'problems!'}] -def _mime_handler_multipart_encrypted(msg, mhe, strict, hints): - return [{mhe.maintype : 'not implemented yet', mhe.subtype : 'problems!'}] -def _mime_handler_multipart_mixed(msg, mhe, strict, hints): - - # We ignore Content-Disposition for multipart/mixed parts, as want - # to process them the same regardless. - - # Generate mime helpers for each part of the multipart collection: - mime_helpers = map(_MimeHelper, mhe.msg_part.get_payload()) - - # Get a mime handler for each part, and execute it: - f = lambda mhe: _get_mime_handler(mhe.maintype, mhe.subtype)(msg, mhe, strict, hints) - - list_of_lists_of_processed_parts = map(f, mime_helpers) - - # Flatten the results: - return _concat(list_of_lists_of_processed_parts) - -def _mime_handler_multipart_signed(msg, mhe, strict, hints): - return [{mhe.maintype : 'not implemented yet', mhe.subtype : 'problems!'}] - -def _mime_handler_multipart_unrecognized(msg, mhe, strict, hints): - - if hints['archive_multipart_unrecognized']: - _archive_part(msg, mhe, strict, hints) - return [ _format_msg_part_data(mhe, hints) ] - else: - # Descend into the message as if it were a multipart/mixed - # type. - return _get_mime_handler('multipart', 'mixed')(msg, mhe, strict, hints) - -def _mime_handler_multipart_parallel(msg, mhe, strict, hints): - - if hints['archive_multipart_parallel']: - _archive_part(msg, mhe, strict, hints) - return [ _format_msg_part_data(mhe, hints) ] - else: - # Descend into the message as if it were a multipart/mixed - # type. - return _get_mime_handler('multipart', 'mixed')(msg, mhe, strict, hints) - -def _mime_handler_multipart_related(msg, mhe, strict, hints): - - if hints['archive_multipart_related']: - _archive_part(msg, mhe, strict, hints) - return [ _format_msg_part_data(mhe, hints) ] - else: - # Descend into the message as if it were a multipart/mixed - # type. - return _get_mime_handler('multipart', 'mixed')(msg, mhe, strict, hints) - -def _mime_handler_text_enriched(msg, mhe, strict, hints): - - # Covert the text/enriched data to plain text and store it: - # mhe.file is already a unicode string. - - # enriched2txt function doesn't have any public errors: - mhe.downgrading_to_text = _enriched2txt.enriched2txt(_native2unicode(mhe.decoded_payload, native_charset=mhe.charset, strict=strict)) - return [ _format_msg_part_data(mhe, hints) ] - -def _mime_handler_text_html(msg, mhe, strict, hints): - - # Covert the text/richtext data to plain text and store it. 
We - # pass richtext2txt the original non-unicode text string and it - # will pass us back a unicode string: - - try: - # html2txt expects unicode in, and spits unicode out: - mhe.downgrading_to_text = _html2txt.html2txt(_native2unicode(mhe.decoded_payload, native_charset=mhe.charset, strict=strict), cols=72) - except _html2txt.HTMLParsingFailed: - if strict: - raise MIMEPartError(msg, mhe, 'downgrading_to_text') - else: - mhe.downgrading_to_text = None - - return [ _format_msg_part_data(mhe, hints) ] - -def _mime_handler_text_plain(msg, mhe, strict, hints): - - return [ _format_msg_part_data(mhe, hints) ] - -def _mime_handler_text_richtext(msg, mhe, strict, hints): - - # Covert the text/richtext data to plain text and store it. We - # pass richtext2txt the original non-unicode text string and it - # will pass us back a unicode string: - - try: - # richtext2txt always returns unicode for us: - mhe.downgrading_to_text = _richtext2txt.richtext2txt(mhe.decoded_payload, charset=mhe.charset, - convert_iso_8859_tags=True, force_conversion=(not strict)) - except _richtext2txt.RichTextConversionError: - if strict: - raise MIMEPartError(msg, mhe, 'downgrading_to_text') - else: - mhe.downgrading_to_text = None - - return [ _format_msg_part_data(mhe, hints) ] - -def _mime_handler_text_rtf(msg, mhe, strict, hints): - - # Note: This parser has some unicode issues which need to be - # fixed! The project seems fairly active... - - # Use RtfLib to convert rtf string to text. - try: - mhe.downgrading_to_text = rtf.Rtf2Txt.getTxt(_native2unicode(mhe.decoded_payload, native_charset=mhe.charset, strict=strict)) - except _RtfException: - if strict: - raise MIMEPartError(msg, mhe, 'downgrading_to_text') - else: - mhe.downgrading_to_text = None - - return [ _format_msg_part_data(mhe, hints) ] - -# Content-Type to Handler mappings: - -_mime_handler_map_application = { 'applefile' : _mime_handler_application_applefile, - 'octet-stream' : _mime_handler_application_octetstream, - 'pgp-encrypted' : _mime_handler_application_pgpencrypted, - 'pgp-keys' : _mime_handler_application_pgpkeys, - 'pgp-signature' : _mime_handler_application_pgpsignature, - 'rtf' : _mime_handler_application_rtf } - -_mime_handler_map_audio = { } # No special audio handlers defined. - -_mime_handler_map_image = { } # No special image handlers defined. - -_mime_handler_map_message = { 'external-body' : _mime_handler_message_externalbody, -# 'news' : _mime_handler_application_octetstream, - 'partial' : _mime_handler_message_partial, # not supported! - 'rfc822' : _mime_handler_message_rfc822 } - -_mime_handler_map_model = { } # No special models handlers defined. - -_mime_handler_map_multipart = { 'alternative' : _mime_handler_multipart_alternative, - 'appledouble' : _mime_handler_multipart_appledouble, - 'encrypted' : _mime_handler_multipart_encrypted, - 'mixed' : _mime_handler_multipart_mixed, - 'parallel' : _mime_handler_multipart_parallel, - 'related' : _mime_handler_multipart_related, - 'signed' : _mime_handler_multipart_signed } - -_mime_handler_map_text = { 'enriched' : _mime_handler_text_enriched, - 'html' : _mime_handler_text_html, - 'plain' : _mime_handler_text_plain, - 'richtext' : _mime_handler_text_richtext, - 'rtf' : _mime_handler_text_rtf } - -_mime_handler_map_video = { } # No special video handlers defined. 
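# --- Editor's illustrative aside; not part of the original file or patch ---
# The _get_mime_handler() function earlier in this deleted module resolves a
# handler from the nested maps above: exact maintype/subtype first, then the
# per-maintype fallback table, then application/octet-stream. A minimal,
# hypothetical sketch of that two-level lookup (the _demo_* names are
# assumptions, not code from this module):

def _demo_get_handler(handler_map, subtype_fallbacks, default_handler,
                      maintype, subtype):
    try:
        # 1. Exact match on (maintype, subtype).
        return handler_map[maintype][subtype]
    except KeyError:
        # 2. Unrecognized subtype: per-maintype default (RFC 2046 says to
        #    treat most of these as application/octet-stream).
        # 3. Unrecognized maintype: fall back to the global default.
        return subtype_fallbacks.get(maintype, default_handler)

# Example: a 'video/x-made-up' part has no entry in the video map, so it
# falls through to the octet-stream handler, as the tables above intend.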
- -_mime_handler_map = { 'application' : _mime_handler_map_application, - 'audio' : _mime_handler_map_audio, - 'image' : _mime_handler_map_image, - 'message' : _mime_handler_map_message, - 'model' : _mime_handler_map_model, - 'multipart' : _mime_handler_map_multipart, - 'text' : _mime_handler_map_text, - 'video' : _mime_handler_map_video } - -# Unrecognized types are handled according to the recomendations of -# RFC2046 which mandates that unrecognized parts of given maintype be -# dealt with as follows: - -# application -> application/octet-stream -# audio -> application/octet-stream -# image -> application/octet-stream -# message -> application/octet-stream -# model -> application/octet-stream - -# In the multipart case, however, we give the module client two -# choices of how to treat unrecognized multipart sections: either as -# multipart/mixed, or to wrap up each of the sub-parts into a tar.gz -# and present this as if it had been a single attachment. - -# multipart -> multipart/mixed -# text -> text/plain -# video -> application/octet-stream - -_mime_handler_map_unrecognized_subtype = { 'application' : _mime_handler_application_octetstream, - 'audio' : _mime_handler_application_octetstream, - 'image' : _mime_handler_application_octetstream, - 'message' : _mime_handler_application_octetstream, - 'model' : _mime_handler_application_octetstream, - 'multipart' : _mime_handler_multipart_unrecognized, - 'text' : _mime_handler_text_plain, - 'video' : _mime_handler_application_octetstream } - -_mime_handler_unrecognized_maintype = _mime_handler_application_octetstream - -# Message Creation: - -# Whereas ParseMessage is a class, CreateMessage is just a function -# which returns the email as an ascii byte string. - -# Creation is an order of magnitude simpler than parsing. When parsing -# we have to try and be able to cope with everything seen out 'in the -# wild'. With creation, we can simply restrict what is allowed to be -# created to a sensible set of options. - -# CreateMessage restricts you to a single plain text body plus any -# number of attached files and any number of attached emails. This -# will all be stuffed into a single multipart/mixed container (unless -# there is only a single part to be added, in which case we skip the -# multipart/mixed container). This is how email should be sent by good -# internet citizens. If this doesn't fit your needs, then your needs -# are esoteric (and if you want to send html email, then you're just -# plain evil)! - -def CreateMessage(_from, - to, - subject, - cc=None, - bcc=None, - message=None, - attach_messages=[], - attach_files=[], - message_id=None, - references=None, - in_reply_to=None, - date=None, - wrap_message=False, - cols=80): - - """ - Returns a byte string containing the email constructed from the - following arguments: - - _from: Either: 1. An ascii string already suitable for inclusion - in this email header (eg. a string you have - torn directory out of another email. - - 2. A 2-tuple (name, email_address), where name is - a persons name and email_address is their - email address. name must be a unicode object. - email_address can be either a unicode object - or a byte string. - - to, - cc, - bcc: Either: 1. An ascii string already suitable for inclusion - in this email header (eg. a string you have - torn directory out of another email. - - 2. A _list_ of items defined in the same way as - _from option 1. - - subject: Either: 1. An ascii string already suitable for inclusion - in this email header (eg. 
a string you have - torn directory out of another email. - - 2. A unicode object. - - message: A unicode object containing what will be the - message body text. - - attach_files: A list of 2-tuples, (filename, open_file_object) - where filename must be a unicode object and - open_file_object must be an open python file - object in mode 'rb'. - - message_id: An ascii string containing a message-id. - - references: A list of objects defined like argument message_id. - - in_reply_to: A list of objects defined like argument message_id. - - date: A ascii string containing an rfc822 formatted date string. - - wrap_message: True/False whether you want to have the message body - wrapped to the width given in argument cols. - - cols: A integer column width. - """ - - if message is not None: - mime_message = [_mimeify_message(message, wrap_message, cols)] - else: - mime_message = [] - - mime_attached_messages = map(_mimeify_attach_message, attach_messages) - mime_attached_files = map(_mimeify_attach_file, attach_files) - - mime_parts = mime_message + mime_attached_messages + mime_attached_files - - if mime_parts == []: - raise EZEmailCreateError("At least one of message, attach_messages or attach_files must be specified.") - elif len(mime_parts) == 1: - # Only one payload, so don't need multipart. - main_part = mime_parts[0] - else: - main_part = email.MIMEMultipart.MIMEMultipart() - map(main_part.attach, mime_parts) - main_part.preamble = 'This message requires a mime aware email reader to be viewed correctly.\n' - # Force ending in newline: - main_part.epilogue = '' - - eH = email.Header.Header - - # The .encode() call here shouldn't be doing any encoding other - # splitting the header onto multiple continuation lines, since we - # are already providing eH with safely asciified strings. - - main_part['From'] = eH(_mimeify_address(_from)).encode() - main_part['Subject'] = eH(_mimeify_unstructured(subject)).encode() - - for (header, value) in [('To', to),('Cc', cc), ('Bcc', bcc)]: - - if value is None: - continue - - if isinstance(value, str): - main_part[header] = eH(value).encode() - else: - main_part[header] = eH(', '.join(map(_mimeify_address, value))).encode() - - if message_id is not None: - main_part['Message-ID'] = eH(message_id).encode() - else: - main_part['Message-ID'] = email.Utils.make_msgid() - - if references is not None: - main_part['References'] = eH(', '.join(references)).encode() - - if in_reply_to is not None: - main_part['In-Reply-To'] = eH(in_reply_to).encode() - - if date is not None: - main_part['Date'] = eH(date).encode() - else: - main_part['Date'] = email.Utils.formatdate() - -# s = smtplib.SMTP() -# print ">>>fnah" -# s.connect(host='smtp.ox.ac.uk') -# s.sendmail('one@tes.la', 'foo@tes.la', main_part.as_string()) -# s.close() - - return main_part.as_string() - -def _mimeify_message(message, wrap_message, cols): - - if wrap_message: - message = _wrap_text(message, cols) - - if _just_ascii(message): - charset = 'us-ascii' - else: - charset = 'utf8' - - msg_part = email.MIMEText.MIMEText(_text=message.encode(charset), - _subtype='plain', - _charset=charset) - - msg_part.add_header('Content-Disposition', 'inline') - - return msg_part - -def _mimeify_attach_message(message_rfc822): - - message_rfc822 = email.message_from_string(message_rfc822) - return email.MIMEMessage.MIMEMessage(message_rfc822, 'rfc822') - -def _mimeify_attach_file((filename_unicode, fh)): - # fh = python file handle - - # Guess the content type based on file extension. 
- content_type, encoding = mimetypes.guess_type(filename_unicode) - - if encoding == 'gzip': - content_type = 'application/x-gzip' - elif encoding == 'compress': - content_type = 'application/x-gzip' - elif encoding is not None: - # we don't recognize the encoding: - content_type = 'application/octet-stream' - else: - # encoding is None; we are safe to use the content_type - # returned by mimetypes. - pass - - # Check that mimetypes actually returned a content_type: - if content_type is None: - content_type = 'application/octet-stream' - - maintype, subtype = content_type.split('/', 1) - - if maintype == 'text': - - # This is what we should be doing: - - # msg_part = email.MIMEText.MIMEText(fh.read(), _subtype=subtype) - - # but until I gather together character encoding detection, - # everything text is going to be attached as - # application/octet-stream. - - msg_part = email.MIMEBase.MIMEBase('application', 'octet-stream') - msg_part.set_payload(fh.read()) - - # Encode the payload using Base64 - email.Encoders.encode_base64(msg_part) - - elif maintype == 'image': - msg_part = email.MIMEImage.MIMEImage(fh.read(), _subtype=subtype) - elif maintype == 'audio': - msg_part = email.MIMEAudio.MIMEAudio(fh.read(), _subtype=subtype) - else: - - msg_part = email.MIMEBase.MIMEBase(maintype, subtype) - msg_part.set_payload(fh.read()) - - # Encode the payload using Base64 - email.Encoders.encode_base64(msg_part) - - # Set the filename parameter - msg_part.add_header('Content-Disposition', 'attachment') - _set_filename(msg_part, filename_unicode) - return msg_part - -def _mimeify_address(address): - - if isinstance(address, str): - return address - else: - (name, email_addr) = address - return email.Utils.formataddr((_mimeify_unstructured(name), email_addr)) - -def _set_filename(msg_part, filename_unicode): - - # Filename parameter of structured header gets rfc2231 encoded: - if _just_ascii(filename_unicode): - filename = filename_unicode.encode('us-ascii') - msg_part.set_param(param='filename', value=filename, - header='Content-Disposition') - else: - charset = 'utf8' - filename = filename_unicode.encode('utf8') - msg_part.set_param(param='filename', value=filename, - header='Content-Disposition', charset=charset) - -def _mimeify_unstructured(string): - - if not isinstance(string, unicode): - # Unstructured fields get RFC2047 encoded. - return string - elif _just_ascii(string): - return string.encode('us-ascii') - else: - return str(email.Header.make_header([(string.encode('utf8'), 'utf8')])) - -def _just_ascii(unicode_string): - # Are are the objects in the unicode string ascii character?: - return unicode_string.encode('utf8') == unicode_string.encode('us-ascii', 'ignore') - -# Error classes. - -class _EmailPackageError(Exception): - """ - Private error that will only be thrown for suspected programming - errors in the Python email package. - """ - -class EZEmailError(Exception): - pass - -class EZEmailParseError(EZEmailError): - - """ - An emtpy parent class for all public errors in this module. - """ - - def __init__(self, msg): - - """ - """ - - self.basic_email_info = _basic_email_info(msg) - Exception.__init__(self) - -class EZEmailCreateError(Exception): - pass - -class _EZEmailPrivateError(Exception): - - """ - An emtpy parent class for all private errors in this module. 
- """ - - pass - -class _UnicodeDecodingError(_EZEmailPrivateError): - - """ - This is a private error which can be raised if attempting to use - the unicode builtin fails because the charset we try to decode - from isn't recognized. - """ - - def __init__(self, value, charset): - - """ - Constructor takes single argument; a string giving the name of - the problem charset. - """ - - self.value = value - self.charset = charset - - -class _StructuredHeaderPairError(_EZEmailPrivateError): - - """ - This is a private error which will be raised if there is an error - trying to parse and rejoin a key/value pair from a structured - header. - """ - - def __init__(self, key, value): - - self.key = key - self.value = value - -class HeaderRFC2231Error(EZEmailParseError): - - """ - This error is raised if we can't decode a structured header - (eg. Content-Type or Content-Disposition) successfully. - """ - - def __init__(self, msg, header, header_value, key, key_value): - - self.header = header - self.header_value = header_value - self.key = key - self.key_value = key_value - - EZEmailParseError.__init__(self, msg) - -class HeaderCharsetError(EZEmailParseError): - - """ - This error is raised if we can't recognize one of the charsets - used in a particular header. - """ - - def __init__(self, msg, header, header_value, problem_part, charset): - - """ - Constructor takes an email.Message message object and header, - value and charset (in their original rfc2047 encoding) as - arguments and stores them. - """ - - self.header = header - self.header_value = header_value - self.problem_part = problem_part - self.charset = charset - - EZEmailParseError.__init__(self, msg) - - def __str__(self): - return "header: %s\nheader value: %s\nproblem part: %s\ncharset: %s" % (self.header, self.header_value, self.problem_part, self.charset) - -class HeaderRFC2047Error(EZEmailParseError): - - """ - This error is raised if we can't parse the RFC2047 encoding used - in a particular header. - """ - - def __init__(self, msg, header, value): - - """ - - Constructor takes an email.Message message object and header, - value and charset (in their original rfc2047 encoding) as - arguments and stores them. - - """ - - self.header = header - self.value = value - - EZEmailParseError.__init__(self) - - def __str__(self): - return "\nheader: %s\nvalue: %s\ninfo: %s" % (self.header, self.value, self.basic_email_info) - - -class FromHeaderParsingError(EZEmailParseError): - - """ - We have a From: header we can't parse. - """ - def __str__(self): - return "\ninfo: %s" % (self.basic_email_info) - - -class FromHeaderMissingError(EZEmailParseError): - - """ - Somehow we have recieved a seriously broken email with no From: header. Reject! - """ - - pass - - -class _ParseDateError(_EZEmailPrivateError): - - """ - - Private error raised when email.Utils.parsedate or - email.Utils.parsedate_tz fails to parse a date header value. - - """ - - pass - -class ParseDateError(EZEmailParseError): - - """ - - Public error raised when email.Utils.parsedate or - email.Utils.parsedate_tz fails to parse a date header value. - - """ - - pass - -class _CTEDecodingError(_EZEmailPrivateError): - - pass - -class _MultipartCTEDecodingAttempt(_EZEmailPrivateError): - - """ - Raised if an attempt is made to CTE decode a multipart message - part. 
- """ - - pass - -class MIMEPartError(EZEmailParseError): - - def __init__(self, msg, mhe, error_type): - - self.maintype = mhe.maintype - self.subtype = mhe.subtype - self.filename = mhe.filename - - if mhe.decoded_payload is None or mhe.msg_part.is_multipart(): - # If we haven't decoded payload successfully, take sample - # from CTE encoded payload: - self.sample = mhe.msg_part.get_payload()[0:100] - else: - # Otherwise, take sample from CTE decoded payload: - self.sample = mhe.decoded_payload[0:100] - - if error_type in self.valid_error_types: - self.error_type = error_type - else: - raise ValueError('Programming Error: error_type = \'' + error_type + - '\' is not valid for MIME parts') - - EZEmailParseError.__init__(self, msg) - - valid_error_types = ['cte_decoding', 'filename_decoding', 'downgrading_to_text', - 'unicode_conversion', 'not_implemented'] - - def __str__(self): - - return "maintype: %s\nsubtype: %s\nfilename: %s\nsample: %s\nerror_type: %s" % (self.maintype, self.subtype, self.filename, self.sample, self.error_type) - -class EZEmailCreateError(EZEmailError): - pass - - -if __name__ == "__main__": - import sys -# import profile - def f(): - for filename in sys.stdin.xreadlines(): - print filename, - filename = filename[:-1] - contents(filename) - print "===" - a = ParseMessage(open(filename, 'rb').read(), strict=False) - print a.primary_message() - f() -# profile.run('f()') - - diff --git a/modules/elmsubmit/lib/elmsubmit_doctype_test.py b/modules/elmsubmit/lib/elmsubmit_doctype_test.py index 473164836..ad729d167 100644 --- a/modules/elmsubmit/lib/elmsubmit_doctype_test.py +++ b/modules/elmsubmit/lib/elmsubmit_doctype_test.py @@ -1,79 +1,76 @@ -# -*- coding: utf-8 -*- - -## $Id$ - +# -*- coding: utf-8 -*- +## +## $Id$ +## ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002, 2003, 2004, 2005 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - import cdsware.elmsubmit as elmsubmit import cdsware.websubmit_engine as websubmit_engine from cdsware.elmsubmit_misc import dict2file as _dict2file import os.path required_fields = ['title', 'author', 'date', 'files'] doctype = 'TEST' def handler(msg, submission_dict, elmconf): # Process the files list: elmsubmit.process_files(msg, submission_dict) # Get a submission directory: storage_dir = elmsubmit.get_storage_dir(msg, doctype) access = os.path.basename(storage_dir) # Write the neccessary data format out to submission directory: try: _dict2file(submission_dict, storage_dir) except EnvironmentError: response_email = elmconf.nolangmsgs.temp_problem admin_response_email = "There was a problem writing data to directory %s." % (storage_dir) error = elmsubmit.elmsubmitError("There was a problem writing data to directory %s." 
% (storage_dir)) return (response_email, admin_response_email, error) # Pass the submission to CDSware proper: try: websubmit_engine.simpleendaction(doctype=doctype, act="SBI", startPg=1, indir=os.path.basename(elmconf.files.maildir), access=access) except websubmit_engine.functionError, e: response_email = elmconf.nolangmsgs.temp_problem admin_response_email = None error = elmsubmit.elmsubmitError("elmsubmit encountered websubmit functionError error: " + e.value) return (response_email, admin_response_email, error) except websubmit_engine.functionStop, e: response_email = elmconf.nolangmsgs.temp_problem admin_response_email = "elmsubmit encountered websubmit error: " + e.value error = elmsubmit.elmsubmitError("elmsubmit encountered websubmit functionStop error: " + e.value) return (response_email, admin_response_email, error) # CDSWare proper will now email the user for us. return (None, None, None) - + diff --git a/modules/elmsubmit/lib/elmsubmit_doctype_test.py.wml b/modules/elmsubmit/lib/elmsubmit_doctype_test.py.wml deleted file mode 100644 index 473164836..000000000 --- a/modules/elmsubmit/lib/elmsubmit_doctype_test.py.wml +++ /dev/null @@ -1,79 +0,0 @@ -# -*- coding: utf-8 -*- - -## $Id$ - -## This file is part of the CERN Document Server Software (CDSware). -## Copyright (C) 2002, 2003, 2004, 2005 CERN. -## -## The CDSware is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License as -## published by the Free Software Foundation; either version 2 of the -## License, or (at your option) any later version. -## -## The CDSware is distributed in the hope that it will be useful, but -## WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -## General Public License for more details. -## -## You should have received a copy of the GNU General Public License -## along with CDSware; if not, write to the Free Software Foundation, Inc., -## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - -import cdsware.elmsubmit as elmsubmit -import cdsware.websubmit_engine as websubmit_engine -from cdsware.elmsubmit_misc import dict2file as _dict2file -import os.path - -required_fields = ['title', - 'author', - 'date', - 'files'] - -doctype = 'TEST' - -def handler(msg, submission_dict, elmconf): - - # Process the files list: - - elmsubmit.process_files(msg, submission_dict) - - # Get a submission directory: - - storage_dir = elmsubmit.get_storage_dir(msg, doctype) - access = os.path.basename(storage_dir) - - # Write the neccessary data format out to submission directory: - - try: - _dict2file(submission_dict, storage_dir) - except EnvironmentError: - response_email = elmconf.nolangmsgs.temp_problem - admin_response_email = "There was a problem writing data to directory %s." % (storage_dir) - error = elmsubmit.elmsubmitError("There was a problem writing data to directory %s." 
% (storage_dir)) - return (response_email, admin_response_email, error) - - # Pass the submission to CDSware proper: - - try: - websubmit_engine.simpleendaction(doctype=doctype, act="SBI", startPg=1, - indir=os.path.basename(elmconf.files.maildir), - access=access) - except websubmit_engine.functionError, e: - response_email = elmconf.nolangmsgs.temp_problem - admin_response_email = None - error = elmsubmit.elmsubmitError("elmsubmit encountered websubmit functionError error: " + e.value) - return (response_email, admin_response_email, error) - - except websubmit_engine.functionStop, e: - response_email = elmconf.nolangmsgs.temp_problem - admin_response_email = "elmsubmit encountered websubmit error: " + e.value - error = elmsubmit.elmsubmitError("elmsubmit encountered websubmit functionStop error: " + e.value) - return (response_email, admin_response_email, error) - - # CDSWare proper will now email the user for us. - return (None, None, None) - - diff --git a/modules/elmsubmit/lib/elmsubmit_enriched2txt.py b/modules/elmsubmit/lib/elmsubmit_enriched2txt.py index 80c1e15f0..d9e822b38 100644 --- a/modules/elmsubmit/lib/elmsubmit_enriched2txt.py +++ b/modules/elmsubmit/lib/elmsubmit_enriched2txt.py @@ -1,231 +1,227 @@ -# -*- coding: utf-8 -*- - -## $Id$ - +# -*- coding: utf-8 -*- +## +## $Id$ +## ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002, 2003, 2004, 2005 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - - """ A text/enriched to text/plain converter. This is a module exporting a single function enriched2txt which takes as its argument a string of 'enriched text' and returns its conversion to 'plain text'. 'enriched text' is the text format as specified in RFC1896 for use as an email payload with mime type text/enriched. Note that it is somewhat simpler than the text/richtext converter (see elmsubmit_richtext2txt.py); this is largely thanks to the enriched text specification attempting to remove many of the complexities found in text/richtext; eg. superscript tags, iso-8859-x tags. If you hand enriched2txt a regular string, the algorithm assumes 7-bit ascii. If you wish to parse internationalized text, make sure you either: 1. Use an encoding that can be treated safely as if it were 7-bit ascii (eg. utf-8) or better: 2. Pass in a unicode object. This function is a direct conversion of the C code from the appendix of RFC1896 which gives a sample enriched text to plain text converter. This is a quick conversion job, since text/enriched email payload is fairly rare these days and so not worth too much time considering. 
I haven't paid much thought as to the quality of the original algorithm --- hopefully the RFC writer had his thinking cap on straight; it seems to produce fairly reasonable output on test documents. Note that one difference in the python version of the parser is that it allows markup tokens of unlimited size. Unlike the specification for text/richtext (see RFC1341), only one charset is allowed in any text/enriched file. Quoting RFC1896: > 1 For cases where the different types of non-ASCII text can be > limited to their own paragraphs with distinct formatting, a > multipart message can be used with each part having a Content-Type > of text/enriched and a different charset parameter. The one caveat > to using this method is that each new part must start in the initial > state for a text/enriched document. That means that all of the > text/enriched commands in the preceding part must be properly > balanced with ending commands before the next text/enriched part > begins. Also, each text/enriched part must begin a new paragraph. > 2 If different types of non-ASCII text are to appear in the same > line or paragraph, or if text/enriched formatting (e.g. margins, > typeface, justification) is required across several different types > of non-ASCII text, a single text/enriched body part should be used > with a character set specified that contains all of the required > characters. For example, a charset parameter of "UNICODE-1-1-UTF-7" > as specified in [RFC-1642] could be used for such purposes. Not only > does UNICODE contain all of the characters that can be represented > in all of the other registered ISO 8859 MIME character sets, but > UTF-7 is fully compatible with other aspects of the text/enriched > standard, including the use of the "<" character referred to > below. Any other character sets that are specified for use in MIME > which contain different types of non-ASCII text can also be used in > these instances. """ def enriched2txt(string): # f and g will be our input/output streams. # We instantiate them as cStringIO objects for speed if the input # string is not unicode (ie. its a normal string type). Otherwise # we make them StringIO objects. if type(string) != unicode: import cStringIO # Create file like object from string for input file. f = cStringIO.StringIO(string) # Create another file like object from string for output file. g = cStringIO.StringIO() else: import StringIO # Create file like object from string for input file. f = StringIO.StringIO(string) # Create another file like object from string for output file. g = StringIO.StringIO(u'') # From here on in we are almost identical to the RFC1896 code, except substitute: # STDIN -> object f # STDOUT -> object g # EOF -> '' # ungetc -> seek(-1,1) paramct = 0 newlinect = 0 nofill = 0 c = f.read(1) while c != '': if (c == '<'): if newlinect == 1: g.write(' ') newlinect = 0; c = f.read(1) if (c == '<'): if paramct <= 0: g.write(c) else: f.seek(-1,1) token = "" c = f.read(1) while c != '' and c!= '>': token += c c = f.read(1) if c == '': break token = token.lower() if token == 'param': paramct += 1 elif token == 'nofill': nofill += 1 elif token == '/param': paramct -= 1 elif token == '/nofill': nofill -= 1 else: if paramct > 0: pass # ignore params elif c == '\n' and nofill <= 0: newlinect += 1 if newlinect > 1: g.write(c) else: if newlinect == 1: g.write(' ') newlinect = 0 g.write(c) c = f.read(1) g.write('\n') return g.getvalue() # The original C code direct from RFC1896 appendix. 
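# --- Editor's illustrative aside; not part of the original file or patch ---
# A minimal usage sketch for the enriched2txt() converter defined above; the
# sample input is hypothetical. Markup tokens such as <bold>...</bold> are
# dropped, '<<' unescapes to a literal '<', and a single newline inside a
# filled paragraph collapses to a space:

sample = "<bold>Hello</bold> enriched\nworld. Use << for a literal less-than."
assert enriched2txt(sample) == 'Hello enriched world. Use < for a literal less-than.\n'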
# See: http://people.qualcomm.com/presnick/textenriched.html # #include # #include # #include # #include # main() { # int c, i, paramct=0, newlinect=0, nofill=0; # char token[62], *p; # while ((c=getc(stdin)) != EOF) { # if (c == '<') { # if (newlinect == 1) putc(' ', stdout); # newlinect = 0; # c = getc(stdin); # if (c == '<') { # if (paramct <= 0) putc(c, stdout); # } else { # ungetc(c, stdin); # for (i=0, p=token; (c=getc(stdin)) != EOF && c != '>'; i++) { # if (i < sizeof(token)-1) # *p++ = isupper(c) ? tolower(c) : c; # } # *p = '\0'; # if (c == EOF) break; # if (strcmp(token, "param") == 0) # paramct++; # else if (strcmp(token, "nofill") == 0) # nofill++; # else if (strcmp(token, "/param") == 0) # paramct--; # else if (strcmp(token, "/nofill") == 0) # nofill--; # } # } else { # if (paramct > 0) # ; /* ignore params */ # else if (c == '\n' && nofill <= 0) { # if (++newlinect > 1) putc(c, stdout); # } else { # if (newlinect == 1) putc(' ', stdout); # newlinect = 0; # putc(c, stdout); # } # } # } # /* The following line is only needed with line-buffering */ # putc('\n', stdout); # exit(0); # } - + diff --git a/modules/elmsubmit/lib/elmsubmit_enriched2txt.py.wml b/modules/elmsubmit/lib/elmsubmit_enriched2txt.py.wml deleted file mode 100644 index 80c1e15f0..000000000 --- a/modules/elmsubmit/lib/elmsubmit_enriched2txt.py.wml +++ /dev/null @@ -1,231 +0,0 @@ -# -*- coding: utf-8 -*- - -## $Id$ - -## This file is part of the CERN Document Server Software (CDSware). -## Copyright (C) 2002, 2003, 2004, 2005 CERN. -## -## The CDSware is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License as -## published by the Free Software Foundation; either version 2 of the -## License, or (at your option) any later version. -## -## The CDSware is distributed in the hope that it will be useful, but -## WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -## General Public License for more details. -## -## You should have received a copy of the GNU General Public License -## along with CDSware; if not, write to the Free Software Foundation, Inc., -## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - - -""" -A text/enriched to text/plain converter. - -This is a module exporting a single function enriched2txt which -takes as its argument a string of 'enriched text' and returns its -conversion to 'plain text'. 'enriched text' is the text format as -specified in RFC1896 for use as an email payload with mime type -text/enriched. - -Note that it is somewhat simpler than the text/richtext converter (see -elmsubmit_richtext2txt.py); this is largely thanks to the enriched -text specification attempting to remove many of the complexities found -in text/richtext; eg. superscript tags, iso-8859-x tags. - -If you hand enriched2txt a regular string, the algorithm assumes -7-bit ascii. If you wish to parse internationalized text, make sure -you either: - -1. Use an encoding that can be treated safely as if it were 7-bit - ascii (eg. utf-8) - -or better: - -2. Pass in a unicode object. - -This function is a direct conversion of the C code from the appendix -of RFC1896 which gives a sample enriched text to plain text -converter. This is a quick conversion job, since text/enriched email -payload is fairly rare these days and so not worth too much time -considering. 
I haven't paid much thought as to the quality of the -original algorithm --- hopefully the RFC writer had his thinking cap -on straight; it seems to produce fairly reasonable output on test -documents. - -Note that one difference in the python version of the parser is that -it allows markup tokens of unlimited size. - -Unlike the specification for text/richtext (see RFC1341), only one -charset is allowed in any text/enriched file. Quoting RFC1896: - -> 1 For cases where the different types of non-ASCII text can be -> limited to their own paragraphs with distinct formatting, a -> multipart message can be used with each part having a Content-Type -> of text/enriched and a different charset parameter. The one caveat -> to using this method is that each new part must start in the initial -> state for a text/enriched document. That means that all of the -> text/enriched commands in the preceding part must be properly -> balanced with ending commands before the next text/enriched part -> begins. Also, each text/enriched part must begin a new paragraph. - -> 2 If different types of non-ASCII text are to appear in the same -> line or paragraph, or if text/enriched formatting (e.g. margins, -> typeface, justification) is required across several different types -> of non-ASCII text, a single text/enriched body part should be used -> with a character set specified that contains all of the required -> characters. For example, a charset parameter of "UNICODE-1-1-UTF-7" -> as specified in [RFC-1642] could be used for such purposes. Not only -> does UNICODE contain all of the characters that can be represented -> in all of the other registered ISO 8859 MIME character sets, but -> UTF-7 is fully compatible with other aspects of the text/enriched -> standard, including the use of the "<" character referred to -> below. Any other character sets that are specified for use in MIME -> which contain different types of non-ASCII text can also be used in -> these instances. -""" - -def enriched2txt(string): - - # f and g will be our input/output streams. - - # We instantiate them as cStringIO objects for speed if the input - # string is not unicode (ie. its a normal string type). Otherwise - # we make them StringIO objects. - - if type(string) != unicode: - - import cStringIO - - # Create file like object from string for input file. - f = cStringIO.StringIO(string) - - # Create another file like object from string for output file. - g = cStringIO.StringIO() - - else: - - import StringIO - - # Create file like object from string for input file. - f = StringIO.StringIO(string) - - # Create another file like object from string for output file. 
- g = StringIO.StringIO(u'') - - # From here on in we are almost identical to the RFC1896 code, except substitute: - # STDIN -> object f - # STDOUT -> object g - # EOF -> '' - # ungetc -> seek(-1,1) - - paramct = 0 - newlinect = 0 - nofill = 0 - - c = f.read(1) - - while c != '': - if (c == '<'): - if newlinect == 1: g.write(' ') - newlinect = 0; - c = f.read(1) - if (c == '<'): - if paramct <= 0: g.write(c) - else: - f.seek(-1,1) - token = "" - c = f.read(1) - - while c != '' and c!= '>': - token += c - c = f.read(1) - - if c == '': break - - token = token.lower() - - if token == 'param': - paramct += 1 - elif token == 'nofill': - nofill += 1 - elif token == '/param': - paramct -= 1 - elif token == '/nofill': - nofill -= 1 - - else: - if paramct > 0: - pass # ignore params - elif c == '\n' and nofill <= 0: - newlinect += 1 - if newlinect > 1: g.write(c) - else: - if newlinect == 1: g.write(' ') - newlinect = 0 - g.write(c) - - c = f.read(1) - - g.write('\n') - - return g.getvalue() - -# The original C code direct from RFC1896 appendix. -# See: http://people.qualcomm.com/presnick/textenriched.html - -# #include -# #include -# #include -# #include - -# main() { -# int c, i, paramct=0, newlinect=0, nofill=0; -# char token[62], *p; - -# while ((c=getc(stdin)) != EOF) { -# if (c == '<') { -# if (newlinect == 1) putc(' ', stdout); -# newlinect = 0; -# c = getc(stdin); -# if (c == '<') { -# if (paramct <= 0) putc(c, stdout); -# } else { -# ungetc(c, stdin); -# for (i=0, p=token; (c=getc(stdin)) != EOF && c != '>'; i++) { -# if (i < sizeof(token)-1) -# *p++ = isupper(c) ? tolower(c) : c; -# } -# *p = '\0'; -# if (c == EOF) break; -# if (strcmp(token, "param") == 0) -# paramct++; -# else if (strcmp(token, "nofill") == 0) -# nofill++; -# else if (strcmp(token, "/param") == 0) -# paramct--; -# else if (strcmp(token, "/nofill") == 0) -# nofill--; -# } -# } else { -# if (paramct > 0) -# ; /* ignore params */ -# else if (c == '\n' && nofill <= 0) { -# if (++newlinect > 1) putc(c, stdout); -# } else { -# if (newlinect == 1) putc(' ', stdout); -# newlinect = 0; -# putc(c, stdout); -# } -# } -# } -# /* The following line is only needed with line-buffering */ -# putc('\n', stdout); -# exit(0); -# } - - diff --git a/modules/elmsubmit/lib/elmsubmit_field_validation.py b/modules/elmsubmit/lib/elmsubmit_field_validation.py index 14db8b4a2..6461debe3 100644 --- a/modules/elmsubmit/lib/elmsubmit_field_validation.py +++ b/modules/elmsubmit/lib/elmsubmit_field_validation.py @@ -1,98 +1,94 @@ -# -*- coding: utf-8 -*- - -## $Id$ - +# -*- coding: utf-8 -*- +## +## $Id$ +## ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002, 2003, 2004, 2005 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. 
- - - import re def author(value): """ The author list must be in the following format: Put one author per line, and a comma ',' (with no preceding space) between the name and the firstname initial letters. The name is going first, followed by the firstname initial letters. Precede each initial by a single space. Place only a single space between surnames. Example: Put Le Meur, J Y Baron, T for Le Meur Jean-Yves & Baron Thomas. """ # Strip each line of leading/trainling whitespace and remove blank lines. value = '\n'.join(filter(lambda line: line != '', map(lambda line: line.strip(), value.splitlines()))) # txt = txt.replace("\r\n", "\n") # Change to unix newline conventions. # Allow names like: # 'MacDonald Schlüter Wolsey-Smith, P J' hyphenated_word = r'\w+(-\w+)*' author_surname = r'%s( %s)*' % (hyphenated_word, hyphenated_word) comma_space = r', ' initials = r'\w( \w)*' author_re = author_surname + comma_space + initials # Allow multiline list with no trailing spaces, and only single # (optional) terminating newline: author_list = r'(?u)^%s(\n%s)*?$' % (author_re, author_re) if re.compile(author_list).search(value): return (author.__doc__, value, True) else: return (author.__doc__, value, False) def date(value): """ The date field must be in dd/mm/yyyy format. eg. 01/03/2010 """ value = value.strip() day = '(3[01]|[12][0-9]|0[1-9])' month = '(1[012]|0[1-9])' year = '(\d\d\d\d)' date_re = r'^%s/%s/%s(?!\n)$' % (day, month, year) if re.compile(date_re).search(value): return (date.__doc__, value, True) else: return (date.__doc__, value, False) def files(value): # Strip each line of leading/trainling whitespace and remove blank lines. # Lowercase each filename. value = '\n'.join(filter(lambda line: line != '', map(lambda line: line.strip().lower(), value.splitlines()))) return (files.__doc__, value, True) - + diff --git a/modules/elmsubmit/lib/elmsubmit_field_validation.py.wml b/modules/elmsubmit/lib/elmsubmit_field_validation.py.wml deleted file mode 100644 index 14db8b4a2..000000000 --- a/modules/elmsubmit/lib/elmsubmit_field_validation.py.wml +++ /dev/null @@ -1,98 +0,0 @@ -# -*- coding: utf-8 -*- - -## $Id$ - -## This file is part of the CERN Document Server Software (CDSware). -## Copyright (C) 2002, 2003, 2004, 2005 CERN. -## -## The CDSware is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License as -## published by the Free Software Foundation; either version 2 of the -## License, or (at your option) any later version. -## -## The CDSware is distributed in the hope that it will be useful, but -## WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -## General Public License for more details. -## -## You should have received a copy of the GNU General Public License -## along with CDSware; if not, write to the Free Software Foundation, Inc., -## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - - -import re - -def author(value): - """ - The author list must be in the following format: - Put one author per line, and a comma ',' (with no preceding - space) between the name and the firstname initial letters. - - The name is going first, followed by the firstname initial - letters. Precede each initial by a single space. Place only a - single space between surnames. - - Example: Put - - Le Meur, J Y - Baron, T - - for - - Le Meur Jean-Yves & Baron Thomas. 
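
A brief, hedged example of calling the validators above; each returns a (help text, normalised value, passed) triple, with the docstring doubling as the help text shown to the submitter.

from cdsware.elmsubmit_field_validation import author, date

(help_text, value, ok) = author("Le Meur, J Y\nBaron, T")
print ok                 # True: one 'Surname, Initials' entry per line
(help_text, value, ok) = date("  01/03/2010  ")
print ok, value          # True '01/03/2010' (surrounding whitespace stripped)
(help_text, value, ok) = date("2010-03-01")
print ok                 # False: only dd/mm/yyyy is accepted
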
- """ - - # Strip each line of leading/trainling whitespace and remove blank lines. - value = '\n'.join(filter(lambda line: line != '', map(lambda line: line.strip(), value.splitlines()))) - - # txt = txt.replace("\r\n", "\n") # Change to unix newline conventions. - - # Allow names like: - # 'MacDonald Schlüter Wolsey-Smith, P J' - - hyphenated_word = r'\w+(-\w+)*' - author_surname = r'%s( %s)*' % (hyphenated_word, hyphenated_word) - comma_space = r', ' - initials = r'\w( \w)*' - author_re = author_surname + comma_space + initials - - # Allow multiline list with no trailing spaces, and only single - # (optional) terminating newline: - - author_list = r'(?u)^%s(\n%s)*?$' % (author_re, author_re) - - if re.compile(author_list).search(value): - return (author.__doc__, value, True) - else: - return (author.__doc__, value, False) - -def date(value): - """ - The date field must be in dd/mm/yyyy format. - eg. 01/03/2010 - """ - - value = value.strip() - - day = '(3[01]|[12][0-9]|0[1-9])' - month = '(1[012]|0[1-9])' - year = '(\d\d\d\d)' - date_re = r'^%s/%s/%s(?!\n)$' % (day, month, year) - - if re.compile(date_re).search(value): - return (date.__doc__, value, True) - else: - return (date.__doc__, value, False) - -def files(value): - - # Strip each line of leading/trainling whitespace and remove blank lines. - # Lowercase each filename. - value = '\n'.join(filter(lambda line: line != '', map(lambda line: line.strip().lower(), value.splitlines()))) - - return (files.__doc__, value, True) - - diff --git a/modules/elmsubmit/lib/elmsubmit_filename_generator.py b/modules/elmsubmit/lib/elmsubmit_filename_generator.py index 3f4b7ffe1..10d8134ab 100644 --- a/modules/elmsubmit/lib/elmsubmit_filename_generator.py +++ b/modules/elmsubmit/lib/elmsubmit_filename_generator.py @@ -1,231 +1,226 @@ -# -*- coding: utf-8 -*- - -## $Id$ - +# -*- coding: utf-8 -*- +## +## $Id$ +## ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002, 2003, 2004, 2005 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. 
- - import sys import os.path _this_module = sys.modules[__name__] _this_module_dir = os.path.abspath(os.path.dirname(_this_module.__file__)) import random import tempfile import re import os import mimetypes try: import magic.magic as magic _got_magic = True except ImportError: import mimetypes _got_magic = False import gzip import bz2 from cdsware.elmsubmit_misc import open_tempfile as _open_tempfile from cdsware.elmsubmit_misc import random_alphanum_string as _random_alphanum_string from cdsware.elmsubmit_misc import remove_tempfile as _remove_tempfile def generate_filename(filename=None, file=None, content_type=None, no_rand_chars=8, prefix='', postfix=''): name_stub = _random_alphanum_string(no_rand_chars) name_ext = calculate_filename_extension(filename, file, content_type) return prefix + name_stub + postfix + '.' + name_ext def calculate_filename_extension(filename=None, file=None, content_type=None): # If libmagic is installed and ./magic/magic.so has been # successfully built, then we use a python interface to libmagic # (man libmagic) to calculate a file extension using a specially # prepared magic data file (./magic/magic.ext) which maps # magic tests to file extensions. Otherwise we use the mimetypes # module from the standard Python distribution. if (filename is None) and (file is None) and (content_type is None): raise TypeError('at least one of filename, file or content_type must be specified') elif (file is None) and (filename is None): # We only have content_type: return calculate_filename_ext_mimetypes(content_type) # We have at least one of file and filename, so we try to use libmagic elif _got_magic: return calculate_filename_ext_libmagic(filename, file) # We haven't got libmagic, so must use mimetypes: else: # But mimetypes requires content_type: if content_type is None: raise ImportError('Failed to import magic module. If no content-type is given, then magic module is required.') else: return calculate_filename_ext_mimetypes(content_type) def calculate_filename_ext_libmagic(filename=None, file=None): # See comments in magic/magic.ext for details of the format # of the data file. All file extensions if recognized by a magic # test will be returned in the form "file_ext:{xyz}"; this lets us # detect the "file_ext:{}" marker and know we have a successful # guess at the correct extension. The reason we need this marker # is that libmagic has many tests whose return value is not # governed through the magic data file and so we need some way of # being sure a file extension has been returned. eg: # >>> magician.file('/etc/init.d') # "symbolic link to `rc.d/init.d'" if filename is None and file is None: raise ValueError('at least one of file or content_type must be specified') if not _got_magic: raise ImportError('magic module did not import successfully') magician = magic.open(magic.MAGIC_NONE) magic_data_file = os.path.join(_this_module_dir, 'magic/magic.ext') ret_load = magician.load(magic_data_file) # Throw private error if the magic data file is corrupt, or # doesn't exist. if ret_load != 0: raise _MagicDataError() if filename is None: # then we have only been given file as binary string. # Get a temporary file and write file variable out to it # because the magic module expects to be handed the name of a # real file. tf, tf_name = _open_tempfile(mode='wb') tf.write(file) tf.close() delete_file = True else: os.stat(filename) # Make sure we can stat the file. 
tf_name = filename delete_file = False ext_info = magician.file(tf_name) # Now process ext_info to see if we can find a file extension # contained in it. file_ext_re = re.compile(r'file_ext:{(.+?)}') file_ext_match = file_ext_re.search(ext_info) if file_ext_match: name_ext = file_ext_match.group(1) # See if we have a compressed file type we can deal # with. If so, uncompress it and call ourself to get more # info: # Note that we could use the magic.MAGIC_COMPRESS flag to # get libmagic to do the decompression for us but: # 1. It only supports gzip # 2. The implementation has a nasty bug which has only # been fixed in very recent releases of libmagic. if name_ext == 'gz': try: # Decompress the stream: decomp_file = gzip.open(tf_name).read() except zlib.error: # Couldn't decompress sucessfully, so just stick # with extension we have. pass else: # Guess an extension of the decompressed stream and # tack current '.gz' on the end: name_ext = calculate_filename_ext_libmagic(file=decomp_file) + '.' + name_ext elif name_ext == 'bz2': try: # Decompress the file: decomp_file = bz2.BZ2File(tf_name).read() except IOError: # Couldn't decompress sucessfully, so just stick # with extension we have. pass else: # Guess an extension of the decompressed stream and # tack current '.bz2' on the end: name_ext = calculate_filename_ext_libmagic(file=decomp_file) + '.' + name_ext # Otherwise, look for special results from libmagic's # 'internal tests' that we recognize: elif ext_info.lower().rfind('tar archive') != -1: name_ext = 'tar' elif ext_info.lower().rfind('text') != -1: name_ext = 'txt' # Can't guess a filetype so use generic extension .dat else: name_ext = 'dat' # Identification done so get rid of the temp file, assuming we created the file: if delete_file: _remove_tempfile(tf_name) return name_ext mimetypes.types_map = {} mimetypes.init([os.path.join(_this_module_dir, 'mime.types.edited')]) def calculate_filename_ext_mimetypes(content_type): # mimetypes.types_map contains many 'builtin' maps. We empty # it because we only want to use the maps from our edited # mime.types file: name_ext = mimetypes.guess_extension(content_type) # Use '.dat' as generic file extension. if name_ext is None: name_ext = '.dat' # Remove leading dot produced by mimetypes. name_ext = name_ext[1:] return name_ext # Errors: # This module may also produce IOError from it use of temporary # files. # REMEMBER TO DOCUMENT THIS ERROR POTENTIAL class _MagicDataError(Exception): """ Private error raised when we cannot compile and load the magic data file successfully. This will only occur if there is a problem with the module's installation. """ pass - - diff --git a/modules/elmsubmit/lib/elmsubmit_filename_generator.py.wml b/modules/elmsubmit/lib/elmsubmit_filename_generator.py.wml deleted file mode 100644 index 3f4b7ffe1..000000000 --- a/modules/elmsubmit/lib/elmsubmit_filename_generator.py.wml +++ /dev/null @@ -1,231 +0,0 @@ -# -*- coding: utf-8 -*- - -## $Id$ - -## This file is part of the CERN Document Server Software (CDSware). -## Copyright (C) 2002, 2003, 2004, 2005 CERN. -## -## The CDSware is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License as -## published by the Free Software Foundation; either version 2 of the -## License, or (at your option) any later version. -## -## The CDSware is distributed in the hope that it will be useful, but -## WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
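
A usage sketch of the two entry points above, assuming the cdsware package is installed. Whether the libmagic path or the mimetypes fallback is taken depends on whether the optional magic extension was built, so the printed extensions are indicative only.

from cdsware.elmsubmit_filename_generator import generate_filename
from cdsware.elmsubmit_filename_generator import calculate_filename_extension

# With only a MIME type, the edited mime.types table is consulted:
print calculate_filename_extension(content_type='application/pdf')     # e.g. 'pdf'
# With raw data and magic.so built, libmagic guesses the extension:
print calculate_filename_extension(file='just some plain text\n' * 5)  # e.g. 'txt'
# generate_filename() prepends a random stem to the guessed extension:
print generate_filename(file='just some plain text\n' * 5, prefix='att_')  # e.g. 'att_dqzvkltp.txt'
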
See the GNU -## General Public License for more details. -## -## You should have received a copy of the GNU General Public License -## along with CDSware; if not, write to the Free Software Foundation, Inc., -## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - -import sys -import os.path - -_this_module = sys.modules[__name__] -_this_module_dir = os.path.abspath(os.path.dirname(_this_module.__file__)) - -import random -import tempfile -import re -import os -import mimetypes - -try: - import magic.magic as magic - _got_magic = True -except ImportError: - import mimetypes - _got_magic = False - -import gzip -import bz2 - -from cdsware.elmsubmit_misc import open_tempfile as _open_tempfile -from cdsware.elmsubmit_misc import random_alphanum_string as _random_alphanum_string -from cdsware.elmsubmit_misc import remove_tempfile as _remove_tempfile - - -def generate_filename(filename=None, file=None, content_type=None, no_rand_chars=8, prefix='', postfix=''): - - name_stub = _random_alphanum_string(no_rand_chars) - name_ext = calculate_filename_extension(filename, file, content_type) - return prefix + name_stub + postfix + '.' + name_ext - -def calculate_filename_extension(filename=None, file=None, content_type=None): - - # If libmagic is installed and ./magic/magic.so has been - # successfully built, then we use a python interface to libmagic - # (man libmagic) to calculate a file extension using a specially - # prepared magic data file (./magic/magic.ext) which maps - # magic tests to file extensions. Otherwise we use the mimetypes - # module from the standard Python distribution. - - if (filename is None) and (file is None) and (content_type is None): - raise TypeError('at least one of filename, file or content_type must be specified') - elif (file is None) and (filename is None): - # We only have content_type: - return calculate_filename_ext_mimetypes(content_type) - # We have at least one of file and filename, so we try to use libmagic - elif _got_magic: - return calculate_filename_ext_libmagic(filename, file) - # We haven't got libmagic, so must use mimetypes: - else: - # But mimetypes requires content_type: - if content_type is None: - raise ImportError('Failed to import magic module. If no content-type is given, then magic module is required.') - else: - return calculate_filename_ext_mimetypes(content_type) - -def calculate_filename_ext_libmagic(filename=None, file=None): - - # See comments in magic/magic.ext for details of the format - # of the data file. All file extensions if recognized by a magic - # test will be returned in the form "file_ext:{xyz}"; this lets us - # detect the "file_ext:{}" marker and know we have a successful - # guess at the correct extension. The reason we need this marker - # is that libmagic has many tests whose return value is not - # governed through the magic data file and so we need some way of - # being sure a file extension has been returned. eg: - - # >>> magician.file('/etc/init.d') - # "symbolic link to `rc.d/init.d'" - - if filename is None and file is None: raise ValueError('at least one of file or content_type must be specified') - if not _got_magic: raise ImportError('magic module did not import successfully') - - magician = magic.open(magic.MAGIC_NONE) - - magic_data_file = os.path.join(_this_module_dir, 'magic/magic.ext') - ret_load = magician.load(magic_data_file) - - # Throw private error if the magic data file is corrupt, or - # doesn't exist. 
- - if ret_load != 0: raise _MagicDataError() - - if filename is None: - # then we have only been given file as binary string. - - # Get a temporary file and write file variable out to it - # because the magic module expects to be handed the name of a - # real file. - - tf, tf_name = _open_tempfile(mode='wb') - tf.write(file) - tf.close() - - delete_file = True - else: - os.stat(filename) # Make sure we can stat the file. - tf_name = filename - delete_file = False - - ext_info = magician.file(tf_name) - - # Now process ext_info to see if we can find a file extension - # contained in it. - - file_ext_re = re.compile(r'file_ext:{(.+?)}') - file_ext_match = file_ext_re.search(ext_info) - - if file_ext_match: - name_ext = file_ext_match.group(1) - - # See if we have a compressed file type we can deal - # with. If so, uncompress it and call ourself to get more - # info: - - # Note that we could use the magic.MAGIC_COMPRESS flag to - # get libmagic to do the decompression for us but: - # 1. It only supports gzip - # 2. The implementation has a nasty bug which has only - # been fixed in very recent releases of libmagic. - - if name_ext == 'gz': - - try: - # Decompress the stream: - decomp_file = gzip.open(tf_name).read() - except zlib.error: - # Couldn't decompress sucessfully, so just stick - # with extension we have. - pass - else: - # Guess an extension of the decompressed stream and - # tack current '.gz' on the end: - name_ext = calculate_filename_ext_libmagic(file=decomp_file) + '.' + name_ext - - elif name_ext == 'bz2': - - try: - # Decompress the file: - decomp_file = bz2.BZ2File(tf_name).read() - except IOError: - # Couldn't decompress sucessfully, so just stick - # with extension we have. - pass - else: - # Guess an extension of the decompressed stream and - # tack current '.bz2' on the end: - name_ext = calculate_filename_ext_libmagic(file=decomp_file) + '.' + name_ext - - # Otherwise, look for special results from libmagic's - # 'internal tests' that we recognize: - - elif ext_info.lower().rfind('tar archive') != -1: - name_ext = 'tar' - - elif ext_info.lower().rfind('text') != -1: - name_ext = 'txt' - - # Can't guess a filetype so use generic extension .dat - - else: - name_ext = 'dat' - - # Identification done so get rid of the temp file, assuming we created the file: - if delete_file: _remove_tempfile(tf_name) - - return name_ext - -mimetypes.types_map = {} -mimetypes.init([os.path.join(_this_module_dir, 'mime.types.edited')]) - -def calculate_filename_ext_mimetypes(content_type): - - # mimetypes.types_map contains many 'builtin' maps. We empty - # it because we only want to use the maps from our edited - # mime.types file: - - name_ext = mimetypes.guess_extension(content_type) - - # Use '.dat' as generic file extension. - if name_ext is None: name_ext = '.dat' - - # Remove leading dot produced by mimetypes. - name_ext = name_ext[1:] - - return name_ext - -# Errors: - -# This module may also produce IOError from it use of temporary -# files. -# REMEMBER TO DOCUMENT THIS ERROR POTENTIAL - -class _MagicDataError(Exception): - - """ - Private error raised when we cannot compile and load the magic - data file successfully. This will only occur if there is a problem - with the module's installation. 
- """ - - pass - - - diff --git a/modules/elmsubmit/lib/elmsubmit_html2txt.py b/modules/elmsubmit/lib/elmsubmit_html2txt.py index bba1dcd48..7e781c3b5 100644 --- a/modules/elmsubmit/lib/elmsubmit_html2txt.py +++ b/modules/elmsubmit/lib/elmsubmit_html2txt.py @@ -1,187 +1,184 @@ -# -*- coding: utf-8 -*- - -## $Id$ - +# -*- coding: utf-8 -*- +## +## $Id$ +## ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002, 2003, 2004, 2005 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - import StringIO import formatter import htmllib import sgmllib import os from cdsware.elmsubmit_misc import write_to_and_return_tempfile_name as _write_to_and_return_tempfile_name from cdsware.elmsubmit_misc import remove_tempfile as _remove_tempfile from cdsware.elmsubmit_misc import mapmany as _mapmany # Search down to ###!!! See here !!!### for editable stuff. # Parser classes: class UnicodeHTMLParser(htmllib.HTMLParser): def unknown_charref(self, ref): # Take the HTML character reference and convert it to unicode. try: self.handle_data(unichr(int(ref))) except(OverflowError, ValueError): raise HTMLParsingFailed # myhtmlentitydefs.py should be found in the dir with this file: from myhtmlentitydefs import entitydefs class NativeParser: # NativeParser doesn't really need to be wrapped in a class, but # we need to provide the same parser_instance.parse() interface as # used for command line parsers. def parse(self, html, cols): file = StringIO.StringIO(u'') # Create HTML parser: writer = formatter.DumbWriter(file, maxcol=cols) myformatter = formatter.AbstractFormatter(writer) p = UnicodeHTMLParser(myformatter) try: p.feed(html) except sgmllib.SGMLParseError: raise HTMLParsingFailed p.close() return file.getvalue() class CLParser: # Provide a generic interface to command line parsers. # We could have saved some work by avoiding writing html to a temp # file for those command line parsers which allow input of html # documents on stdin. However, not all of them do and a uniform # interface was simplest. def __init__(self, commandline_list): self.commandline_list = commandline_list def parse(self, html, cols): if not isinstance(html, unicode): raise UnicodeInputRequired utf8html = html.encode('utf8') tf_name = _write_to_and_return_tempfile_name(utf8html) # Replace cols marker: f = lambda x: ((x == ['cols']) and str(cols)) or x # Replace filename marker: g = lambda x: ((x == ['filename']) and tf_name) or x commandline_list = _mapmany([f,g], self.commandline_list) commandline = ''.join(commandline_list) # Run the process using popen3; possibly dodgy on Windows! # Need popen3 rather other popen function because we want to # grab stderr and hide it from the clients console. 
(stdin, stdout, stderr) = os.popen3(commandline, 'r') utf8output = stdout.read() exit_status = stdout.close() _remove_tempfile(tf_name) # Just in case the parser outputs bogus utf8: # Check the return code: if exit_status is not None: raise HTMLParsingFailed # Convert back to unicode object and return: try: output = unicode(utf8output, 'utf8') return output except (LookupError, UnicodeError): raise HTMLParsingFailed ###!!! See here !!!### # Parsers: parser_native = NativeParser() # These can be reinstated some time down the line when command line # parsers have worked out their charset support a little better # (rather than the current 'if you get lynx with this patch available # from some guys website, then recompile...'): # It appears w3m requires patches to support utf8: # parser_w3m = CLParser(["w3m -dump -cols ", ['cols'], " -T 'text/html' file://", ['filename']]) # It appear lynx doesn't support charsets: # parser_lynx = CLParser(['lynx -dump -force-html -width=', ['cols'], ' file://', ['filename']]) # elinks works OK, except it appear not to support &#{unicoderef} tags, but these are rare(ish): # Actually, trying # parser_elinks = CLParser([ 'elinks -dump -dump-charset "utf-8" -force-html -dump-width ', ['cols'], ' file://', ['filename']]) # The version (2.1pre13) on my system of the other 'famous' command # line browser name links doesn't seem to have a dump option! available_parsers = [ # parser_w3m, # parser_lynx, # parser_elinks, parser_native ] # Key function: def html2txt(html, use_parsers=available_parsers, cols=72): # Try each parser in turn (given in the list use_parsers) to see # if they work: for parser in use_parsers: try: text = parser.parse(html, cols) except HTMLParsingFailed: continue else: return text # None of the parsers worked. raise HTMLParsingFailed # Errors: class HTMLParsingFailed(Exception): """ Raised if HTML parsing fails for any reason. """ pass class UnicodeInputRequired(Exception): """ Raised if attempt is made to parse anything other than unicode. """ - + diff --git a/modules/elmsubmit/lib/elmsubmit_html2txt.py.wml b/modules/elmsubmit/lib/elmsubmit_html2txt.py.wml deleted file mode 100644 index bba1dcd48..000000000 --- a/modules/elmsubmit/lib/elmsubmit_html2txt.py.wml +++ /dev/null @@ -1,187 +0,0 @@ -# -*- coding: utf-8 -*- - -## $Id$ - -## This file is part of the CERN Document Server Software (CDSware). -## Copyright (C) 2002, 2003, 2004, 2005 CERN. -## -## The CDSware is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License as -## published by the Free Software Foundation; either version 2 of the -## License, or (at your option) any later version. -## -## The CDSware is distributed in the hope that it will be useful, but -## WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -## General Public License for more details. -## -## You should have received a copy of the GNU General Public License -## along with CDSware; if not, write to the Free Software Foundation, Inc., -## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. 
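
A hedged example of the key function above. Only the native htmllib-based parser is enabled in available_parsers; it expects a unicode string (or plain 7-bit ASCII), resolves named entities through the bundled myhtmlentitydefs, and re-flows the output to the requested width.

from cdsware.elmsubmit_html2txt import html2txt, HTMLParsingFailed

html = u'<html><body><h1>Summary</h1><p>Caf&eacute; costs 3 euros.</p></body></html>'
try:
    # Output is wrapped to at most 40 columns by formatter.DumbWriter.
    print html2txt(html, cols=40).encode('utf8')
except HTMLParsingFailed:
    print 'none of the available parsers could handle the input'
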
- - -import StringIO -import formatter -import htmllib -import sgmllib -import os - -from cdsware.elmsubmit_misc import write_to_and_return_tempfile_name as _write_to_and_return_tempfile_name -from cdsware.elmsubmit_misc import remove_tempfile as _remove_tempfile -from cdsware.elmsubmit_misc import mapmany as _mapmany - -# Search down to ###!!! See here !!!### for editable stuff. - -# Parser classes: - -class UnicodeHTMLParser(htmllib.HTMLParser): - - def unknown_charref(self, ref): - # Take the HTML character reference and convert it to unicode. - try: - self.handle_data(unichr(int(ref))) - except(OverflowError, ValueError): - raise HTMLParsingFailed - - # myhtmlentitydefs.py should be found in the dir with this file: - from myhtmlentitydefs import entitydefs - -class NativeParser: - - # NativeParser doesn't really need to be wrapped in a class, but - # we need to provide the same parser_instance.parse() interface as - # used for command line parsers. - - def parse(self, html, cols): - - file = StringIO.StringIO(u'') - - # Create HTML parser: - writer = formatter.DumbWriter(file, maxcol=cols) - myformatter = formatter.AbstractFormatter(writer) - p = UnicodeHTMLParser(myformatter) - - try: - p.feed(html) - except sgmllib.SGMLParseError: - raise HTMLParsingFailed - - p.close() - - return file.getvalue() - -class CLParser: - - # Provide a generic interface to command line parsers. - - # We could have saved some work by avoiding writing html to a temp - # file for those command line parsers which allow input of html - # documents on stdin. However, not all of them do and a uniform - # interface was simplest. - - def __init__(self, commandline_list): - - self.commandline_list = commandline_list - - def parse(self, html, cols): - - if not isinstance(html, unicode): raise UnicodeInputRequired - - utf8html = html.encode('utf8') - tf_name = _write_to_and_return_tempfile_name(utf8html) - - # Replace cols marker: - f = lambda x: ((x == ['cols']) and str(cols)) or x - # Replace filename marker: - g = lambda x: ((x == ['filename']) and tf_name) or x - - commandline_list = _mapmany([f,g], self.commandline_list) - commandline = ''.join(commandline_list) - - # Run the process using popen3; possibly dodgy on Windows! - # Need popen3 rather other popen function because we want to - # grab stderr and hide it from the clients console. - - (stdin, stdout, stderr) = os.popen3(commandline, 'r') - - utf8output = stdout.read() - exit_status = stdout.close() - _remove_tempfile(tf_name) - - # Just in case the parser outputs bogus utf8: - - # Check the return code: - if exit_status is not None: raise HTMLParsingFailed - - # Convert back to unicode object and return: - try: - output = unicode(utf8output, 'utf8') - return output - except (LookupError, UnicodeError): - raise HTMLParsingFailed - - -###!!! 
See here !!!### - -# Parsers: - -parser_native = NativeParser() - -# These can be reinstated some time down the line when command line -# parsers have worked out their charset support a little better -# (rather than the current 'if you get lynx with this patch available -# from some guys website, then recompile...'): - -# It appears w3m requires patches to support utf8: -# parser_w3m = CLParser(["w3m -dump -cols ", ['cols'], " -T 'text/html' file://", ['filename']]) - -# It appear lynx doesn't support charsets: -# parser_lynx = CLParser(['lynx -dump -force-html -width=', ['cols'], ' file://', ['filename']]) - -# elinks works OK, except it appear not to support &#{unicoderef} tags, but these are rare(ish): -# Actually, trying -# parser_elinks = CLParser([ 'elinks -dump -dump-charset "utf-8" -force-html -dump-width ', ['cols'], ' file://', ['filename']]) - -# The version (2.1pre13) on my system of the other 'famous' command -# line browser name links doesn't seem to have a dump option! - - -available_parsers = [ # parser_w3m, - # parser_lynx, - # parser_elinks, - parser_native ] - -# Key function: - -def html2txt(html, use_parsers=available_parsers, cols=72): - - # Try each parser in turn (given in the list use_parsers) to see - # if they work: - - for parser in use_parsers: - try: - text = parser.parse(html, cols) - except HTMLParsingFailed: - continue - else: - return text - - # None of the parsers worked. - raise HTMLParsingFailed - -# Errors: - -class HTMLParsingFailed(Exception): - """ - Raised if HTML parsing fails for any reason. - """ - pass - -class UnicodeInputRequired(Exception): - """ - Raised if attempt is made to parse anything other than unicode. - """ - - diff --git a/modules/elmsubmit/lib/elmsubmit_misc.py b/modules/elmsubmit/lib/elmsubmit_misc.py index 607465f32..5a58dbc21 100644 --- a/modules/elmsubmit/lib/elmsubmit_misc.py +++ b/modules/elmsubmit/lib/elmsubmit_misc.py @@ -1,576 +1,572 @@ -# -*- coding: utf-8 -*- - -## $Id$ - +# -*- coding: utf-8 -*- +## +## $Id$ +## ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002, 2003, 2004, 2005 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - - """ Miscellaneous utlity functions that have the potential for re-use. """ import tempfile import os import os.path import random import stat import ConfigParser import textwrap import re def concat(list_of_lists): return [item for list in list_of_lists for item in list] def cleave_pair(list): # Should really generalize this to the nth case; but I only need # pairs right now! 
""" [1,2,3,4,5,6,7] becomes ([1,3,5,7], [2,4,6]) """ lefts = [] rights = [] k = (lefts, rights) for x in range(0, len(list)): k[x % 2].append(list[x]) return (lefts, rights) def merge_pair(lefts, rights): """ [1,3,5,7], [2,4,6] becomes [1,2,3,4,5,6,7] """ k = (lefts, rights) list = [] for x in range(0, len(lefts) + len(rights)): (d, m) = divmod(x, 2) list.append(k[m][d]) return list def cr2lf(file): """ Replace CRLF with LF. ie. Convert text file from DOS to Unix end of line conventions. """ return file.replace("\r\n", "\n") # Directory backup using mirrordir: def backup_directory(original_directory, backup_directory): # Backing up the directory requires GNU mirrordir to be installed; # shutil.copytree won't do the job if there are pipes or fifos # etc. in my_directory. # Implementing mirrordir directly in python would be a # good project! # mkdir will throw the correct errors for us: os.mkdir(backup_directory) commandline = 'mirrordir ' + original_directory + ' ' + backup_directory # Run the process using popen3; possibly dodgy on Windows! # Need popen3 rather other popen function because we want to # grab stderr and hide it from the clients console. (stdin, stdout, stderr) = os.popen3(commandline, 'r') # Close straight away; mirrordir expects no input. # return the exist status: return stdout.close() # Tempfile stuff: def open_tempfile(mode='wb'): # We open in binary mode and write a non-unicode string and so # can be sure that python will write the data verbatim, # without fiddling with CRLFs etc. (tf_file_descriptor, tf_name) = tempfile.mkstemp() tf = os.fdopen(tf_file_descriptor, mode) return (tf, tf_name) def write_to_and_return_tempfile_name(data): (tf, tf_name) = open_tempfile() tf.write(data) tf.close() return tf_name def remove_tempfile(filename): """ Tries to unlink the named tempfile. Catches the OSError if unlinking fails. """ try: os.unlink(filename) except OSError: # Couldn't delete temp file; no big problem. pass # Random string stuff: def random_alphanum_string(length, chars='abcdefghijklmnopqrstuvwxyz' ): """ Create a random string of given length, choosing each character with equal probability from the list given in string chars. For example: chars='aab' would cause each character to be 'a' with 2/3 probability and 'b' with 1/3 probability (pseudorandomly speaking). """ alphanums = list(chars) # Replicate list into a list of lists and map the random choice # function over it: choices = map(random.choice, [alphanums] * length) # Concat the choices into a string: return ''.join(choices) def mapmany(functions, in_list): # If functions equals [phi, ... , alpha, beta, gamma] return # map(phi, ... map(alpha, map(beta, map(gamma, in_list))) ... ) functions.reverse() g = lambda list, f: map(f, list) return reduce(g, functions, in_list) def dict2file(dictionary, directory): """ Take any dictionary, eg.: { 'title' : 'The loveliest title.', 'name' : 'Pete the dog.', 'info' : { 'age' : '21', 'evil' : 'yes' } } and create a set of files in the given directory: directory/title directory/name directory/info/age directory/info/evil so that each filename is a dictionary key, and the contents of each file is the value that the key pointed to. 
""" def f((path, dictionary_or_data)): fullpath = os.path.join(directory, path) try: dictionary_or_data.has_key except AttributeError: open(fullpath, 'wb').write(dictionary_or_data) else: os.mkdir(fullpath) dict2file(dictionary_or_data, fullpath) map(f, dictionary.items()) return None def recursive_dir_contents(dir): files = [] def f(arg, dirname, fnames): files.extend(map(lambda file: os.path.join(dirname, file), fnames)) os.path.walk(dir, f, None) return files def count_dotdot(path): path_parts = path.split(os.sep) dotdots = filter(lambda part: part == '..', path_parts) return len(dotdots) def common_prefix(seq, default_empty=''): try: leng = 0 for tuple in zip(*seq): if tuple[1:] != tuple[:-1]: break leng += 1 return seq[0][:leng] except TypeError: return default_empty def split_common_path(thePaths): # sanitze paths: f = lambda x: os.path.normpath(os.path.expanduser(x)) thePaths = map(f, thePaths) # thePaths is a list of paths (strings) thePaths = map(lambda p: p.split(os.sep), thePaths) # chop common part off the paths theBase = common_prefix(thePaths, []) thePaths = map(lambda p, c=len(theBase): p[c:], thePaths) # convert back to strings if theBase == ['']: theBase = '/' else: theBase = os.sep.join(theBase) thePaths = map(os.sep.join, thePaths) return (theBase, thePaths) def mkdir_parents(path): tree = dirtree(path) tree.reverse() for parent in tree: if os.path.exists(parent): if os.path.isdir(parent): continue else: # This will raise the correct OSError for us. os.chdir(parent) else: os.mkdir(parent) def dirtree(dir): # sanitize path: dir = os.path.normpath(os.path.expanduser(dir)) return _dirtree(dir) def _dirtree(dir): """ An example will explain: >>> elmsubmit_misc.dirtree('/hof/wim/sif/eff/hoo') ['/hof/wim/sif/eff/hoo', '/hof/wim/sif/eff', '/hof/wim/sif', '/hof/wim', '/hof', '/'] """ # POSIX allows // or / for the root dir. # And it seems the rules say you aren't allowed to collapse // into /. # I don't know why this is! if dir == '//' or dir == '/': return [dir] elif dir == '': return [] else: return [dir] + _dirtree(os.path.dirname(dir)) def provide_dir_with_perms_then_exec(dir, function, perms, barrier_dir): # This function won't allow you to alter the root directories' # permissions: if your going to be changing the permissions on # your root directory, you probably need to do it more carefully # than with a python function! # sanitize path: dir = os.path.abspath(os.path.normpath(os.path.expanduser(dir))) # Check to see if we're already in the state we want to be in: try: targets_current_perms = get_perms(dir) targets_current_owner_uid = get_owner_uid(dir) except OSError, e: if e.errno == 2: # dir definitely doesn't exist. raise elif e.errno == 13: # don't have sufficient permissions to read the # permissions. dir_info_read = False else: dir_info_read = True if dir_info_read and targets_current_owner_uid != os.geteuid(): # We don't own the file: raise OSError("file %s not owned by this process's effective user: cannot proceed" % (dir)) elif dir_info_read and targets_current_perms & perms == perms: # This directory already has user bits set to at least perms, # so execute the given function: return function() # If we haven't exited the function already, we need to change the target dirs # permissions (or simply couldn't read the permissions!) 
# Get a list of all of the dirs parents: dir_list = dirtree(dir) if barrier_dir is not None: # sanitize path: barrier_dir = os.path.abspath(os.path.normpath(os.path.expanduser(barrier_dir))) # Check the barrier dir is one of the parents of dir: if not barrier_dir in dir_list[1:]: raise ValueError('argument barrier_dir must be a proper parent directory of argument dir') # Get a list of all the directories that lie between the # barrier dir and the target dir, including the barrier dir, # but excluding the target dir: barrier_dir_list = dirtree(barrier_dir) g = lambda d: (d == barrier_dir) or (not (d in barrier_dir_list or d == dir)) operable_parent_dirs = filter(g, dir_list) else: operable_parent_dirs = dir_list # Make sure we have at least wx permissions on parent: parents_old_states = _get_perms_on(operable_parent_dirs, perms=0300) # Now stat the target dir if we didn't manage previously: if not dir_info_read: try: targets_current_perms = get_perms(dir) targets_current_owner_uid = get_owner_uid(dir) except OSError, e: if e.errno == 2: # race condition: raise OSError("Directory structure altered during processing: %s removed during processing" % (dir)) elif e.errno == 13: # race condition: raise OSError("Directory structure %s altered during processing: permissions changed during processing" % (dirlist)) if targets_current_owner_uid != os.geteuid(): # We don't own this file and so can't chmod it: We # couldn't see this previously because we didn't # have permission to stat the dir. Undo the # permission changes we've already made and report # the error: _safely_chmod_dirlist(parents_old_states) raise OSError("file %s not owned by this process's effective user: cannot proceed" % (dir)) elif targets_current_perms & perms == perms: # We already have the perms we need. try: return_value = function() finally: _safely_chmod_dirlist(parents_old_states) return return_value # Now change the permissions of our target directory: try: os.chmod(dir, perms | targets_current_perms) except OSError: # race condition: raise OSError("Directory structure %s altered during processing: permissions changed during processing" % (dirlist)) try: # Now permissions are open, exec our function: return_value = function() finally: # Close up the permissions we had to open: _safely_chmod_dirlist([[dir, targets_current_perms]] + parents_old_states) # Return the input functions return value: return return_value def _get_perms_on(dirlist, perms=0300): # Note: any comment labelling a particular error as "race # condition" is meant to indicate an error that can only arise if # another process is attempting to alter the directory strucutre # at the same time as us - this function _must not_ be used if # such a situation is possible. # User perms < rx doesn't make sense for this function. You need # at least wx bits on a directory to change the permissions on its # child directories. if perms < 0300: raise ValueError("argument perms must be >= 3 in the user byte") dir = dirlist[0] remaining_dirs = dirlist[1:] try: targets_current_perms = get_perms(dir) targets_current_owner_uid = get_owner_uid(dir) except OSError, e: if e.errno == 2: # dir definitely doesn't exist. raise elif e.errno == 13: # don't have sufficient permissions to read the # permissions. 
dir_info_read = False else: dir_info_read = True if dir_info_read and targets_current_owner_uid != os.geteuid(): # We don't own the file: raise OSError("file %s not owned by this process's effective user: cannot proceed" % (dir)) elif dir_info_read and targets_current_perms & perms == perms: # This directory already has user bits set to at least perms, # so nothing to do: return [] elif dir_info_read and targets_current_perms & perms != perms: # We need to adjust the permissions. See if the parent will # let us: if remaining_dirs == []: # We have no parents available: raise OSError("no members of the given dirtree have sufficient permissions for us to chmod") else: parent = remaining_dirs[0] # Figure out if we're the owner of the parent and have permissions try: parents_current_perms = get_perms(parent) parents_current_owner_uid = get_owner_uid(parent) except OSError, e: if e.errno == 2: # dir definitely doesn't exist. raise elif e.errno == 13: # don't have sufficient permissions to read the # permissions. parent_dir_info_read = False else: parent_dir_info_read = True if parent_dir_info_read and parents_current_owner_uid == os.geteuid() and parents_current_perms & 0300 == 0300: # We own the parent and have sufficient permission to chmod its contents: try: os.chmod(dir, perms | targets_current_perms) except OSError: # race condition: raise OSError("Directory structure %s altered during processing: permissions changed during processing" % (dirlist)) return [[dir, targets_current_perms]] else: # We need to step down a level: pass else: # dir info was not read. if remaining_dirs == []: raise OSError("no members of the given dirtree have sufficient permissions for us to chmod") # If the prior if-then-else didn't return or throw an error then # either we couldn't stat the given dir or we don't have # permission to change its permissions, so therefore we need to # step down a level: parents_old_states = _get_perms_on(remaining_dirs, perms) if not dir_info_read: try: targets_current_perms = get_perms(dir) targets_current_owner_uid = get_owner_uid(dir) except OSError, e: if e.errno == 2: # race condition: raise OSError("Directory structure altered during processing: %s removed during processing" % (dir)) elif e.errno == 13: # race condition: raise OSError("Directory structure %s altered during processing: permissions changed during processing" % (dirlist)) if targets_current_owner_uid != os.geteuid(): # We don't own this file and so can't chmod it: We # couldn't see this previously because we didn't # have permission to stat the dir. 
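
A hedged sketch of how provide_dir_with_perms_then_exec() (defined a little further up) might be called. The spool paths are hypothetical, and barrier_dir must be a proper parent of dir (or None).

import os
from cdsware.elmsubmit_misc import provide_dir_with_perms_then_exec

maildir = '/var/spool/elmsubmit/maildir/new'          # hypothetical path
listing = provide_dir_with_perms_then_exec(
    dir=maildir,
    function=lambda: os.listdir(maildir),             # run once the perms are guaranteed
    perms=0700,                                       # user rwx needed on maildir itself
    barrier_dir='/var/spool/elmsubmit')               # never chmod above this directory
# Any permissions that had to be widened are restored before the call returns.
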
Undo the # permission changes we've already made and report # the error: _safely_chmod_dirlist(parents_old_states) raise OSError("file %s not owned by this process's effective user: cannot proceed" % (dir)) elif targets_current_perms & perms == perms: # current directory already has the permissions we # want; previously the parent's perms were preventing # us from seeing this: return parents_old_states else: # current directory's permissions need altering: # Set the user bits to at least perms: try: os.chmod(dir, perms | targets_current_perms) except OSError: # race condition: raise OSError("Directory structure %s altered during processing: permissions changed during processing" % (dirlist)) return [[dir, targets_current_perms]] + parents_old_states else: # current directory's permissions need altering: # Set the user bits to at least perms: try: os.chmod(dir, perms | targets_current_perms) except OSError: # race condition: raise OSError("Directory structure %s altered during processing: permissions changed during processing" % (dirlist)) return [[dir, targets_current_perms]] + parents_old_states def _safely_chmod_dirlist(dirlist): f = lambda (dir, perms): os.chmod(dir, perms) map(f, dirlist) def get_perms(path): return stat.S_IMODE(os.stat(path)[stat.ST_MODE]) def get_owner_uid(path): return os.stat(path)[stat.ST_UID] # Text utils: def wrap_text(text, cols=80): print "text", text parts = re.split(r'(\n(?:\s*\n))+', text) (paragraphs, whitespace) = cleave_pair(parts) for x in parts: print ">>", x print "paras", paragraphs print "white", whitespace wrapped_paragraphs = map(lambda t: textwrap.fill(t, width=cols), paragraphs) print wrapped_paragraphs return ''.join(merge_pair(wrapped_paragraphs, whitespace)) # Module utils: def import_dots(string): """ Note that if you execute: mod = __import__('one.two.three') then variable mod will point to module one, not module 'one.two.three'. whereas: mod = import_dots('one.two.three') will point to module 'one.two.three'. """ mod = __import__(string) components = string.split('.') for comp in components[1:]: mod = getattr(mod, comp) return mod - + diff --git a/modules/elmsubmit/lib/elmsubmit_misc.py.wml b/modules/elmsubmit/lib/elmsubmit_misc.py.wml deleted file mode 100644 index 607465f32..000000000 --- a/modules/elmsubmit/lib/elmsubmit_misc.py.wml +++ /dev/null @@ -1,576 +0,0 @@ -# -*- coding: utf-8 -*- - -## $Id$ - -## This file is part of the CERN Document Server Software (CDSware). -## Copyright (C) 2002, 2003, 2004, 2005 CERN. -## -## The CDSware is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License as -## published by the Free Software Foundation; either version 2 of the -## License, or (at your option) any later version. -## -## The CDSware is distributed in the hope that it will be useful, but -## WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -## General Public License for more details. -## -## You should have received a copy of the GNU General Public License -## along with CDSware; if not, write to the Free Software Foundation, Inc., -## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - - -""" -Miscellaneous utlity functions that have the potential for re-use. 
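
Two of the helpers defined above are easy to misread, so here is a brief, hedged illustration; it only assumes the standard library plus cdsware.elmsubmit_misc.

import os
from cdsware.elmsubmit_misc import import_dots, random_alphanum_string

print __import__('os.path') is os            # True: __import__ returns the top-level package
print import_dots('os.path').join('a', 'b')  # 'a/b': the dotted submodule itself is returned
print random_alphanum_string(8)              # e.g. 'qhzketio' (lowercase a-z by default)
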
-""" - -import tempfile -import os -import os.path -import random -import stat -import ConfigParser -import textwrap -import re - -def concat(list_of_lists): - - return [item for list in list_of_lists for item in list] - -def cleave_pair(list): - - # Should really generalize this to the nth case; but I only need - # pairs right now! - - """ - [1,2,3,4,5,6,7] - - becomes - - ([1,3,5,7], [2,4,6]) - """ - - lefts = [] - rights = [] - k = (lefts, rights) - - for x in range(0, len(list)): - k[x % 2].append(list[x]) - - return (lefts, rights) - -def merge_pair(lefts, rights): - """ - [1,3,5,7], [2,4,6] - - becomes - - [1,2,3,4,5,6,7] - """ - - k = (lefts, rights) - list = [] - - for x in range(0, len(lefts) + len(rights)): - (d, m) = divmod(x, 2) - list.append(k[m][d]) - - return list - -def cr2lf(file): - - """ - Replace CRLF with LF. ie. Convert text file from DOS to Unix end - of line conventions. - """ - - return file.replace("\r\n", "\n") - -# Directory backup using mirrordir: - -def backup_directory(original_directory, backup_directory): - - # Backing up the directory requires GNU mirrordir to be installed; - # shutil.copytree won't do the job if there are pipes or fifos - # etc. in my_directory. - - # Implementing mirrordir directly in python would be a - # good project! - - # mkdir will throw the correct errors for us: - os.mkdir(backup_directory) - - commandline = 'mirrordir ' + original_directory + ' ' + backup_directory - - # Run the process using popen3; possibly dodgy on Windows! - # Need popen3 rather other popen function because we want to - # grab stderr and hide it from the clients console. - (stdin, stdout, stderr) = os.popen3(commandline, 'r') - # Close straight away; mirrordir expects no input. - - # return the exist status: - return stdout.close() - -# Tempfile stuff: - -def open_tempfile(mode='wb'): - - # We open in binary mode and write a non-unicode string and so - # can be sure that python will write the data verbatim, - # without fiddling with CRLFs etc. - - (tf_file_descriptor, tf_name) = tempfile.mkstemp() - tf = os.fdopen(tf_file_descriptor, mode) - return (tf, tf_name) - -def write_to_and_return_tempfile_name(data): - - (tf, tf_name) = open_tempfile() - tf.write(data) - tf.close() - return tf_name - -def remove_tempfile(filename): - """ - Tries to unlink the named tempfile. Catches the OSError if - unlinking fails. - """ - try: - os.unlink(filename) - except OSError: - # Couldn't delete temp file; no big problem. - pass - -# Random string stuff: - -def random_alphanum_string(length, chars='abcdefghijklmnopqrstuvwxyz' ): - """ - Create a random string of given length, choosing each character - with equal probability from the list given in string chars. For - example: chars='aab' would cause each character to be 'a' with 2/3 - probability and 'b' with 1/3 probability (pseudorandomly - speaking). - """ - - alphanums = list(chars) - - # Replicate list into a list of lists and map the random choice - # function over it: - choices = map(random.choice, [alphanums] * length) - - # Concat the choices into a string: - return ''.join(choices) - -def mapmany(functions, in_list): - - # If functions equals [phi, ... , alpha, beta, gamma] return - # map(phi, ... map(alpha, map(beta, map(gamma, in_list))) ... 
) - - functions.reverse() - - g = lambda list, f: map(f, list) - - return reduce(g, functions, in_list) - -def dict2file(dictionary, directory): - """ - Take any dictionary, eg.: - - { 'title' : 'The loveliest title.', - 'name' : 'Pete the dog.', - 'info' : { 'age' : '21', 'evil' : 'yes' } - } - - and create a set of files in the given directory: - directory/title - directory/name - directory/info/age - directory/info/evil - so that each filename is a dictionary key, and the contents of - each file is the value that the key pointed to. - """ - - def f((path, dictionary_or_data)): - - fullpath = os.path.join(directory, path) - - try: - dictionary_or_data.has_key - except AttributeError: - open(fullpath, 'wb').write(dictionary_or_data) - else: - os.mkdir(fullpath) - dict2file(dictionary_or_data, fullpath) - - map(f, dictionary.items()) - - return None - -def recursive_dir_contents(dir): - - files = [] - - def f(arg, dirname, fnames): - files.extend(map(lambda file: os.path.join(dirname, file), fnames)) - - os.path.walk(dir, f, None) - - return files - -def count_dotdot(path): - path_parts = path.split(os.sep) - dotdots = filter(lambda part: part == '..', path_parts) - return len(dotdots) - -def common_prefix(seq, default_empty=''): - try: - leng = 0 - for tuple in zip(*seq): - if tuple[1:] != tuple[:-1]: break - leng += 1 - return seq[0][:leng] - except TypeError: return default_empty - - -def split_common_path(thePaths): - # sanitze paths: - f = lambda x: os.path.normpath(os.path.expanduser(x)) - thePaths = map(f, thePaths) - - # thePaths is a list of paths (strings) - thePaths = map(lambda p: p.split(os.sep), thePaths) - - # chop common part off the paths - theBase = common_prefix(thePaths, []) - thePaths = map(lambda p, c=len(theBase): p[c:], thePaths) - # convert back to strings - if theBase == ['']: - theBase = '/' - else: - theBase = os.sep.join(theBase) - thePaths = map(os.sep.join, thePaths) - return (theBase, thePaths) - -def mkdir_parents(path): - tree = dirtree(path) - tree.reverse() - - for parent in tree: - if os.path.exists(parent): - if os.path.isdir(parent): - continue - else: - # This will raise the correct OSError for us. - os.chdir(parent) - else: - os.mkdir(parent) - -def dirtree(dir): - # sanitize path: - dir = os.path.normpath(os.path.expanduser(dir)) - return _dirtree(dir) - -def _dirtree(dir): - """ - An example will explain: - - >>> elmsubmit_misc.dirtree('/hof/wim/sif/eff/hoo') - ['/hof/wim/sif/eff/hoo', - '/hof/wim/sif/eff', - '/hof/wim/sif', - '/hof/wim', - '/hof', - '/'] - """ - - # POSIX allows // or / for the root dir. - # And it seems the rules say you aren't allowed to collapse // into /. - # I don't know why this is! - if dir == '//' or dir == '/': - return [dir] - elif dir == '': - return [] - else: - return [dir] + _dirtree(os.path.dirname(dir)) - -def provide_dir_with_perms_then_exec(dir, function, perms, barrier_dir): - # This function won't allow you to alter the root directories' - # permissions: if your going to be changing the permissions on - # your root directory, you probably need to do it more carefully - # than with a python function! - - # sanitize path: - dir = os.path.abspath(os.path.normpath(os.path.expanduser(dir))) - - # Check to see if we're already in the state we want to be in: - try: - targets_current_perms = get_perms(dir) - targets_current_owner_uid = get_owner_uid(dir) - except OSError, e: - if e.errno == 2: - # dir definitely doesn't exist. 
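To make the dict2file helper above concrete, a small sketch with made-up data and paths (illustrative only, not part of the patch); it simply restates the mapping described in the docstring: keys become file or directory names, leaf values become file contents.

# Sketch of what dict2file produces on disk (hypothetical paths and values).
from elmsubmit_misc import dict2file   # assumed import path

record = {'title': 'A made-up title',
          'info': {'age': '21', 'evil': 'no'}}

dict2file(record, '/tmp/record')       # '/tmp/record' itself is assumed to exist
# Resulting layout:
#   /tmp/record/title      containing "A made-up title"
#   /tmp/record/info/age   containing "21"
#   /tmp/record/info/evil  containing "no"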
- raise - elif e.errno == 13: - # don't have sufficient permissions to read the - # permissions. - dir_info_read = False - else: - dir_info_read = True - - if dir_info_read and targets_current_owner_uid != os.geteuid(): - # We don't own the file: - raise OSError("file %s not owned by this process's effective user: cannot proceed" % (dir)) - elif dir_info_read and targets_current_perms & perms == perms: - # This directory already has user bits set to at least perms, - # so execute the given function: - return function() - - # If we haven't exited the function already, we need to change the target dirs - # permissions (or simply couldn't read the permissions!) - - # Get a list of all of the dirs parents: - dir_list = dirtree(dir) - - if barrier_dir is not None: - # sanitize path: - barrier_dir = os.path.abspath(os.path.normpath(os.path.expanduser(barrier_dir))) - - # Check the barrier dir is one of the parents of dir: - if not barrier_dir in dir_list[1:]: - raise ValueError('argument barrier_dir must be a proper parent directory of argument dir') - - # Get a list of all the directories that lie between the - # barrier dir and the target dir, including the barrier dir, - # but excluding the target dir: - barrier_dir_list = dirtree(barrier_dir) - - g = lambda d: (d == barrier_dir) or (not (d in barrier_dir_list or d == dir)) - operable_parent_dirs = filter(g, dir_list) - else: - operable_parent_dirs = dir_list - # Make sure we have at least wx permissions on parent: - parents_old_states = _get_perms_on(operable_parent_dirs, perms=0300) - - # Now stat the target dir if we didn't manage previously: - if not dir_info_read: - try: - targets_current_perms = get_perms(dir) - targets_current_owner_uid = get_owner_uid(dir) - except OSError, e: - if e.errno == 2: - # race condition: - raise OSError("Directory structure altered during processing: %s removed during processing" % (dir)) - elif e.errno == 13: - # race condition: - raise OSError("Directory structure %s altered during processing: permissions changed during processing" % (dirlist)) - - if targets_current_owner_uid != os.geteuid(): - # We don't own this file and so can't chmod it: We - # couldn't see this previously because we didn't - # have permission to stat the dir. Undo the - # permission changes we've already made and report - # the error: - _safely_chmod_dirlist(parents_old_states) - raise OSError("file %s not owned by this process's effective user: cannot proceed" % (dir)) - elif targets_current_perms & perms == perms: - # We already have the perms we need. 
- try: - return_value = function() - finally: - _safely_chmod_dirlist(parents_old_states) - return return_value - - # Now change the permissions of our target directory: - try: - os.chmod(dir, perms | targets_current_perms) - except OSError: - # race condition: - raise OSError("Directory structure %s altered during processing: permissions changed during processing" % (dirlist)) - - try: - # Now permissions are open, exec our function: - return_value = function() - finally: - # Close up the permissions we had to open: - _safely_chmod_dirlist([[dir, targets_current_perms]] + parents_old_states) - - # Return the input functions return value: - return return_value - -def _get_perms_on(dirlist, perms=0300): - - # Note: any comment labelling a particular error as "race - # condition" is meant to indicate an error that can only arise if - # another process is attempting to alter the directory strucutre - # at the same time as us - this function _must not_ be used if - # such a situation is possible. - - # User perms < rx doesn't make sense for this function. You need - # at least wx bits on a directory to change the permissions on its - # child directories. - if perms < 0300: raise ValueError("argument perms must be >= 3 in the user byte") - - dir = dirlist[0] - remaining_dirs = dirlist[1:] - - try: - targets_current_perms = get_perms(dir) - targets_current_owner_uid = get_owner_uid(dir) - except OSError, e: - if e.errno == 2: - # dir definitely doesn't exist. - raise - elif e.errno == 13: - # don't have sufficient permissions to read the - # permissions. - dir_info_read = False - else: - dir_info_read = True - - if dir_info_read and targets_current_owner_uid != os.geteuid(): - # We don't own the file: - raise OSError("file %s not owned by this process's effective user: cannot proceed" % (dir)) - elif dir_info_read and targets_current_perms & perms == perms: - # This directory already has user bits set to at least perms, - # so nothing to do: - return [] - elif dir_info_read and targets_current_perms & perms != perms: - # We need to adjust the permissions. See if the parent will - # let us: - if remaining_dirs == []: - # We have no parents available: - raise OSError("no members of the given dirtree have sufficient permissions for us to chmod") - else: - parent = remaining_dirs[0] - # Figure out if we're the owner of the parent and have permissions - try: - parents_current_perms = get_perms(parent) - parents_current_owner_uid = get_owner_uid(parent) - except OSError, e: - if e.errno == 2: - # dir definitely doesn't exist. - raise - elif e.errno == 13: - # don't have sufficient permissions to read the - # permissions. - parent_dir_info_read = False - else: - parent_dir_info_read = True - - if parent_dir_info_read and parents_current_owner_uid == os.geteuid() and parents_current_perms & 0300 == 0300: - # We own the parent and have sufficient permission to chmod its contents: - try: - os.chmod(dir, perms | targets_current_perms) - except OSError: - # race condition: - raise OSError("Directory structure %s altered during processing: permissions changed during processing" % (dirlist)) - return [[dir, targets_current_perms]] - else: - # We need to step down a level: - pass - - else: # dir info was not read. 
- if remaining_dirs == []: - raise OSError("no members of the given dirtree have sufficient permissions for us to chmod") - - # If the prior if-then-else didn't return or throw an error then - # either we couldn't stat the given dir or we don't have - # permission to change its permissions, so therefore we need to - # step down a level: - - parents_old_states = _get_perms_on(remaining_dirs, perms) - - if not dir_info_read: - try: - targets_current_perms = get_perms(dir) - targets_current_owner_uid = get_owner_uid(dir) - except OSError, e: - if e.errno == 2: - # race condition: - raise OSError("Directory structure altered during processing: %s removed during processing" % (dir)) - elif e.errno == 13: - # race condition: - raise OSError("Directory structure %s altered during processing: permissions changed during processing" % (dirlist)) - if targets_current_owner_uid != os.geteuid(): - # We don't own this file and so can't chmod it: We - # couldn't see this previously because we didn't - # have permission to stat the dir. Undo the - # permission changes we've already made and report - # the error: - _safely_chmod_dirlist(parents_old_states) - raise OSError("file %s not owned by this process's effective user: cannot proceed" % (dir)) - elif targets_current_perms & perms == perms: - # current directory already has the permissions we - # want; previously the parent's perms were preventing - # us from seeing this: - return parents_old_states - else: - # current directory's permissions need altering: - # Set the user bits to at least perms: - try: - os.chmod(dir, perms | targets_current_perms) - except OSError: - # race condition: - raise OSError("Directory structure %s altered during processing: permissions changed during processing" % (dirlist)) - return [[dir, targets_current_perms]] + parents_old_states - else: - # current directory's permissions need altering: - # Set the user bits to at least perms: - try: - os.chmod(dir, perms | targets_current_perms) - except OSError: - # race condition: - raise OSError("Directory structure %s altered during processing: permissions changed during processing" % (dirlist)) - return [[dir, targets_current_perms]] + parents_old_states - -def _safely_chmod_dirlist(dirlist): - f = lambda (dir, perms): os.chmod(dir, perms) - map(f, dirlist) - -def get_perms(path): - return stat.S_IMODE(os.stat(path)[stat.ST_MODE]) - -def get_owner_uid(path): - return os.stat(path)[stat.ST_UID] - -# Text utils: - -def wrap_text(text, cols=80): - print "text", text - parts = re.split(r'(\n(?:\s*\n))+', text) - (paragraphs, whitespace) = cleave_pair(parts) - for x in parts: - print ">>", x - print "paras", paragraphs - print "white", whitespace - wrapped_paragraphs = map(lambda t: textwrap.fill(t, width=cols), paragraphs) - print wrapped_paragraphs - return ''.join(merge_pair(wrapped_paragraphs, whitespace)) - -# Module utils: - -def import_dots(string): - """ - Note that if you execute: - - mod = __import__('one.two.three') - - then variable mod will point to module one, not module - 'one.two.three'. - - whereas: - - mod = import_dots('one.two.three') - - will point to module 'one.two.three'. 
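To make the import_dots docstring above concrete, a short sketch using a standard-library dotted path (illustrative only, not part of the patch; the import path for import_dots is an assumption).

# Sketch: __import__ vs. import_dots for a dotted module path.
import os.path
from elmsubmit_misc import import_dots   # assumed import path

top = __import__('os.path')    # binds the top-level 'os' module
sub = import_dots('os.path')   # binds the 'os.path' module itself
assert top is os
assert sub is os.path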
- """ - - mod = __import__(string) - components = string.split('.') - for comp in components[1:]: - mod = getattr(mod, comp) - return mod - diff --git a/modules/elmsubmit/lib/elmsubmit_richtext2txt.py b/modules/elmsubmit/lib/elmsubmit_richtext2txt.py index 4f12985f5..6992d6c24 100644 --- a/modules/elmsubmit/lib/elmsubmit_richtext2txt.py +++ b/modules/elmsubmit/lib/elmsubmit_richtext2txt.py @@ -1,478 +1,475 @@ -# -*- coding: utf-8 -*- - -## $Id$ - +# -*- coding: utf-8 -*- +## +## $Id$ +## ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002, 2003, 2004, 2005 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - """ A text/richtext to text/plain converter. Always returns a unicode string. This is a module exporting a single function 'richtext2txt' which takes a string of 'enriched text' and returns its conversion to 'plain text'. 'rich text' is the text format as specified in RFC1341 for use as an email payload with mime type text/richtext. The code is based on the example parser given in appendix D of RFC1341. It is a quite heavily modified version; the new code (aside from being in Python not C): 1. Takes account of the tag. 2. Deals better with soft newlines. 3. Deals better with the paragraph tag. 4. Takes account of the tag. The resulting code is something of a mishmash of the functional style of programming that I prefer and the 'big while loop' proceedural style in which the original C code is written. With reference to point 4: Richtext is a pain because it allows markup tags to change charsets inside a document. This means that if we get a text/richtext email payload with 'Content-type' header specifying a charset e.g. 'us-ascii', we can't simply decode to a unicode object; it is possible that bytes inside the will break the unicode(str,'us-ascii') function call! This is frustrating because: 1. Why bother to have a charset declaration outside a document only to go and break it inside? This might be understandable if text/richtext was designed independantly of MIME and its Content-Type declarations but: 2. text/richtext is specified in the SAME RFC as the Content-type: MIME header! In fairness to the RFC writer(s), they were working at a time when unicode/iso10646 was still in flux and so it was common for people writing bilingual texts to want to use two charsets in one document. It is interesting to note that the later text/enriched specification (written when unicode had petrified) removes the possibility of charset switching. The existence of tags makes the parser rather more complicated. Treatment notes: > Second, the command "" is used to represent a required > line break. (Otherwise, CRLFs in the data are treated as > equivalent to a single SPACE character.) 2. 
The RFC doesn't say to treat spaces as a special character; ie. that they should be reproduced verbatim. This leads to the odd effect that a string such as follows (where $SPACE$ in reality would be a space character): "Some text...$SPACE$More text..." Is rendered as: "Some text... $SPACE$ More text..." ie. The space is considered a string of text which must be separated from the displayed paragraphs. This seems fairly odd behaviour to me, but the RFC seems to suggest this is correct treatment. """ import re import StringIO def richtext2txt(str, charset='us-ascii', convert_iso_8859_tags=False, force_conversion=False): return _richtext2txt(str, charset, convert_iso_8859_tags, force_conversion) """ Document options somewhere here. ##### 5. Make a note that the parsers assume \n not CRLF conventions so preconvert!!! ##### ------------------------------------------------------------------------------- """ def _richtext2txt(string, charset='us-ascii', convert_iso_8859_tags=False, force_conversion=False, recursive=False, just_closed_para=True, output_file=None): if type(string) == unicode and convert_iso_8859_tags: # Doesn't make sense to have a unicode string # containing mixed charsets. raise ValueError("function richtext2txt cannot have both unicode input string and convert_iso_8859_tags=True.") # f and g will be our input/output streams. # Create file like object from string for input file. f = StringIO.StringIO(string) # Create another file like object from string for output file, # unless we have been handed one by recursive call. if output_file is None: g = StringIO.StringIO(u'') else: g = output_file # When comparing to the RFC1341 code, substitute: # STDIN -> object f # STDOUT -> object g # EOF -> '' # ungetc -> seek(-1,1) # If we're not calling ourself from ISO-8859-X tag, then eat # leading newlines: if not recursive: _eat_all(f,'\n') c = f.read(1) # compile re for use in if then else. Matches 'iso-8859-XX' tags # where xx are digits. iso_re = re.compile(r'^iso-8859-([1-9][0-9]?)$', re.IGNORECASE) iso_close_re = re.compile(r'^/iso-8859-([1-9][0-9]?)$', re.IGNORECASE) while c != '': if c == '<': c, token = _read_token(f) if c == '': break if token == 'lt': g.write('<') just_closed_para = False elif token == 'nl': g.write('\n') # Discard all 'soft newlines' following token: _eat_all(f,'\n') elif token == 'np': g.write('\n\n\n') # Discard all 'soft newlines' following token: _eat_all(f,'\n') just_closed_para = True elif token == 'paragraph': # If we haven't just closed a paragraph tag, or done # equivalent (eg. output an tag) then produce # newlines to offset paragraph: if not just_closed_para: g.write('\n\n') elif token == '/paragraph': g.write('\n\n') # Discard all 'soft newlines' following token: _eat_all(f,'\n') just_closed_para = True elif token == 'comment': commct=1 while commct > 0: c = _throw_away_until(f,'<') # Bin characters until we get a '<' if c == '': break c, token = _read_token(f) if c == '': break if token == '/comment': commct -= 1 elif token == 'comment': commct += 1 elif iso_re.match(token): if not convert_iso_8859_tags: if not force_conversion: raise ISO8859TagError(" tag found when convert_iso_8859_tags=False") else: pass else: # Read in from the input file, stopping to look at # each tag. Keep reading until we have a balanced pair # of tags. Use tag_balance # to keep track of how many open iso-8859 tags we # have, since nesting is legal. When tag_balance hits # 0 we have found a balanced pair. 
tag_balance = 1 iso_str = '' while tag_balance != 0: c, next_str = _read_to_next_token(f) iso_str += next_str if c == '': break c, next_token = _read_token(f) if c == '': break if next_token == token: tag_balance += 1 elif next_token == '/' + token: tag_balance -= 1 if tag_balance != 0: iso_str += ('<' + next_token + '>') # We now have a complete string of text in the # foreign charset in iso_str, so we call ourself # to process it. No need to consider return # value, since we pass g and all the output gets # written to this. _richtext2txt(iso_str, charset, convert_iso_8859_tags, force_conversion, True, just_closed_para, output_file=g) #^^^^ = recursive elif iso_close_re.match(token): if force_conversion: pass else: if convert_iso_8859_tags: raise ISO8859TagError("closing tag before opening tag") else: raise ISO8859TagError(" tag found when convert_iso_8859_tags=False") else: # Ignore unrecognized token. pass elif c == '\n': # Read in contiguous string of newlines and output them as # single space, unless we hit EOF, in which case output # nothing. _eat_all(f,'\n') if _next_char(f) == '': break # If we have just written a newline out, soft newlines # should do nothing: if _last_char(g) != '\n': g.write(' ') else: # We have a 'normal char' so just write it out: _unicode_write(g, c, charset, force_conversion) just_closed_para = False c = f.read(1) # Only output the terminating newline if we aren't being called # recursively. if not recursive: g.write('\n') return g.getvalue() def _read_token(f): """ Read in token from inside a markup tag. """ token = "" c = f.read(1) while c != '' and c!= '>': token += c c = f.read(1) token = token.lower() return c, token def _read_to_next_token(f): out = '' c = f.read(1) while c != '<' and c != '': out += c c = f.read(1) return c, out def _eat_all(f,d): """ Discard all characters from input stream f of type d until we hit a character that is not of type d. Return the most recent bit read from the file. """ got_char = False if _next_char(f) == d: got_char = True while _next_char(f) == d: f.read(1) if got_char: return d else: return None def _throw_away_until(f,d): """ Discard all characters from input stream f until we hit a character of type d. Discard this char also. Return the most recent bit read from the file (which will either be d or EOF). """ c = f.read(1) while c != d and c != '': c = f.read(1) return c def _next_char(f): """ Return the next char in the file. """ # Get the char: c = f.read(1) # If it wasn't an EOF, backup one, otherwise stay put: if c != '': f.seek(-1,1) return c def _last_char(g): """ Look at what the last character written to a file was. """ pos = g.tell() if pos == 0: # At the start of the file. return None else: # Written at least one character, so step back one and read it # off. g.seek(-1,1) return g.read(1) def _unicode_write(g, string, charset, force_conversion): strictness = { True : 'strict', False: 'replace'}[force_conversion] # Could raise a UnicodeDecodingError! unicode_str = unicode(string, charset, strictness) g.write(unicode_str) class RichTextConversionError(Exception): """ An emtpy parent class for all errors in this module. """ pass class ISO8859TagError(RichTextConversionError): """ This error is raised when we are doing a conversion with strict=True, the input string is unicode and we get an iso-8859-x tag. Unicode should not contain mixed charsets. 
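For orientation, a minimal usage sketch for the richtext2txt entry point above. The input string, the expected output and the import path are made up for illustration and are not part of the patch.

# Minimal usage sketch for richtext2txt (sample input made up).
from elmsubmit_richtext2txt import richtext2txt  # assumed import path

rtx = "Hello <bold>world</bold><nl>Second line."
result = richtext2txt(rtx, charset='us-ascii')
# result == u'Hello world\nSecond line.\n'
# (<bold>/</bold> are unrecognized tokens and are ignored, <nl> forces a hard
# line break, and the converter always appends a terminating newline.)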
""" pass # The original C code direct from RFC1341, appendix D # See: http://www.faqs.org/rfcs/rfc1341.html # #include # #include # main() { # int c, i; # char token[50]; # while((c = getc(stdin)) != EOF) { # if (c == '<') { # for (i=0; (i<49 && (c = getc(stdin)) != '>' && c != EOF); ++i) { # token[i] = isupper(c) ? tolower(c) : c; # } # if (c == EOF) break; # if (c != '>') while ((c = getc(stdin)) != '>' && c != EOF) {;} # if (c == EOF) break; # token[i] = '\0'; # if (!strcmp(token, "lt")) { # putc('<', stdout); # } else if (!strcmp(token, "nl")) { # putc('\n', stdout); # } else if (!strcmp(token, "/paragraph")) { # fputs("\n\n", stdout); # } else if (!strcmp(token, "comment")) { # int commct=1; # while (commct > 0) { # while ((c = getc(stdin)) != '<' # && c != EOF) ; # if (c == EOF) break; # for (i=0; (c = getc(stdin)) != '>' # && c != EOF; ++i) { # token[i] = isupper(c) ? # tolower(c) : c; # } # if (c== EOF) break; # token[i] = NULL; # if (!strcmp(token, "/comment")) --commct; # if (!strcmp(token, "comment")) ++commct; # } # } /* Ignore all other tokens */ # } else if (c != '\n') putc(c, stdout); # } # putc('\n', stdout); /* for good measure */ # } # data = open('sample.rtx','r') # t = data.read() - + diff --git a/modules/elmsubmit/lib/elmsubmit_richtext2txt.py.wml b/modules/elmsubmit/lib/elmsubmit_richtext2txt.py.wml deleted file mode 100644 index 4f12985f5..000000000 --- a/modules/elmsubmit/lib/elmsubmit_richtext2txt.py.wml +++ /dev/null @@ -1,478 +0,0 @@ -# -*- coding: utf-8 -*- - -## $Id$ - -## This file is part of the CERN Document Server Software (CDSware). -## Copyright (C) 2002, 2003, 2004, 2005 CERN. -## -## The CDSware is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License as -## published by the Free Software Foundation; either version 2 of the -## License, or (at your option) any later version. -## -## The CDSware is distributed in the hope that it will be useful, but -## WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -## General Public License for more details. -## -## You should have received a copy of the GNU General Public License -## along with CDSware; if not, write to the Free Software Foundation, Inc., -## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - -""" -A text/richtext to text/plain converter. - -Always returns a unicode string. - -This is a module exporting a single function 'richtext2txt' which takes -a string of 'enriched text' and returns its conversion to 'plain -text'. 'rich text' is the text format as specified in RFC1341 for -use as an email payload with mime type text/richtext. - -The code is based on the example parser given in appendix D of -RFC1341. It is a quite heavily modified version; the new code (aside -from being in Python not C): - -1. Takes account of the tag. - -2. Deals better with soft newlines. - -3. Deals better with the paragraph tag. - -4. Takes account of the tag. - -The resulting code is something of a mishmash of the functional style -of programming that I prefer and the 'big while loop' proceedural -style in which the original C code is written. - -With reference to point 4: Richtext is a pain because it allows - markup tags to change charsets inside a -document. This means that if we get a text/richtext email payload -with 'Content-type' header specifying a charset e.g. 
'us-ascii', we -can't simply decode to a unicode object; it is possible that bytes -inside the will break the -unicode(str,'us-ascii') function call! - -This is frustrating because: - -1. Why bother to have a charset declaration outside a document only to - go and break it inside? - -This might be understandable if text/richtext was designed -independantly of MIME and its Content-Type declarations but: - -2. text/richtext is specified in the SAME RFC as the Content-type: - MIME header! - -In fairness to the RFC writer(s), they were working at a time when -unicode/iso10646 was still in flux and so it was common for people -writing bilingual texts to want to use two charsets in one -document. It is interesting to note that the later text/enriched -specification (written when unicode had petrified) removes the -possibility of charset switching. - -The existence of tags makes the parser rather more -complicated. - -Treatment notes: - -> Second, the command "" is used to represent a required -> line break. (Otherwise, CRLFs in the data are treated as -> equivalent to a single SPACE character.) - -2. - -The RFC doesn't say to treat spaces as a special character; ie. that -they should be reproduced verbatim. This leads to the odd effect that -a string such as follows (where $SPACE$ in reality would be a space -character): - -"Some text...$SPACE$More text..." - -Is rendered as: - -"Some text... - -$SPACE$ - -More text..." - -ie. The space is considered a string of text which must be separated -from the displayed paragraphs. This seems fairly odd behaviour to me, -but the RFC seems to suggest this is correct treatment. -""" - -import re -import StringIO - -def richtext2txt(str, charset='us-ascii', convert_iso_8859_tags=False, force_conversion=False): - return _richtext2txt(str, charset, convert_iso_8859_tags, force_conversion) - -""" -Document options somewhere here. - -##### 5. Make a note that the parsers assume \n not CRLF conventions so preconvert!!! -##### ------------------------------------------------------------------------------- - -""" - -def _richtext2txt(string, charset='us-ascii', convert_iso_8859_tags=False, force_conversion=False, - recursive=False, just_closed_para=True, output_file=None): - - if type(string) == unicode and convert_iso_8859_tags: - - # Doesn't make sense to have a unicode string - # containing mixed charsets. - raise ValueError("function richtext2txt cannot have both unicode input string and convert_iso_8859_tags=True.") - - # f and g will be our input/output streams. - - # Create file like object from string for input file. - f = StringIO.StringIO(string) - - # Create another file like object from string for output file, - # unless we have been handed one by recursive call. - - if output_file is None: - g = StringIO.StringIO(u'') - else: - g = output_file - - # When comparing to the RFC1341 code, substitute: - # STDIN -> object f - # STDOUT -> object g - # EOF -> '' - # ungetc -> seek(-1,1) - - # If we're not calling ourself from ISO-8859-X tag, then eat - # leading newlines: - - if not recursive: _eat_all(f,'\n') - - c = f.read(1) - - # compile re for use in if then else. Matches 'iso-8859-XX' tags - # where xx are digits. 
- iso_re = re.compile(r'^iso-8859-([1-9][0-9]?)$', re.IGNORECASE) - iso_close_re = re.compile(r'^/iso-8859-([1-9][0-9]?)$', re.IGNORECASE) - - while c != '': - if c == '<': - - c, token = _read_token(f) - - if c == '': break - - if token == 'lt': - g.write('<') - - just_closed_para = False - elif token == 'nl': - - g.write('\n') - - # Discard all 'soft newlines' following token: - _eat_all(f,'\n') - - elif token == 'np': - - g.write('\n\n\n') - - # Discard all 'soft newlines' following token: - _eat_all(f,'\n') - - just_closed_para = True - - elif token == 'paragraph': - - # If we haven't just closed a paragraph tag, or done - # equivalent (eg. output an tag) then produce - # newlines to offset paragraph: - - if not just_closed_para: g.write('\n\n') - - elif token == '/paragraph': - g.write('\n\n') - - # Discard all 'soft newlines' following token: - _eat_all(f,'\n') - - just_closed_para = True - - elif token == 'comment': - commct=1 - - while commct > 0: - - c = _throw_away_until(f,'<') # Bin characters until we get a '<' - - if c == '': break - - c, token = _read_token(f) - - if c == '': break - - if token == '/comment': - commct -= 1 - elif token == 'comment': - commct += 1 - - elif iso_re.match(token): - - if not convert_iso_8859_tags: - if not force_conversion: - raise ISO8859TagError(" tag found when convert_iso_8859_tags=False") - else: - pass - else: - # Read in from the input file, stopping to look at - # each tag. Keep reading until we have a balanced pair - # of tags. Use tag_balance - # to keep track of how many open iso-8859 tags we - # have, since nesting is legal. When tag_balance hits - # 0 we have found a balanced pair. - - tag_balance = 1 - iso_str = '' - - while tag_balance != 0: - - c, next_str = _read_to_next_token(f) - - iso_str += next_str - - if c == '': break - - c, next_token = _read_token(f) - - if c == '': break - - if next_token == token: - tag_balance += 1 - elif next_token == '/' + token: - tag_balance -= 1 - - if tag_balance != 0: - iso_str += ('<' + next_token + '>') - - # We now have a complete string of text in the - # foreign charset in iso_str, so we call ourself - # to process it. No need to consider return - # value, since we pass g and all the output gets - # written to this. - - _richtext2txt(iso_str, charset, convert_iso_8859_tags, force_conversion, - True, just_closed_para, output_file=g) - #^^^^ = recursive - - elif iso_close_re.match(token): - - if force_conversion: - pass - else: - if convert_iso_8859_tags: - raise ISO8859TagError("closing tag before opening tag") - else: - raise ISO8859TagError(" tag found when convert_iso_8859_tags=False") - else: - # Ignore unrecognized token. - pass - - elif c == '\n': - - # Read in contiguous string of newlines and output them as - # single space, unless we hit EOF, in which case output - # nothing. - - _eat_all(f,'\n') - - if _next_char(f) == '': break - - # If we have just written a newline out, soft newlines - # should do nothing: - if _last_char(g) != '\n': g.write(' ') - - else: - # We have a 'normal char' so just write it out: - _unicode_write(g, c, charset, force_conversion) - - just_closed_para = False - - c = f.read(1) - - # Only output the terminating newline if we aren't being called - # recursively. - if not recursive: - g.write('\n') - - return g.getvalue() - -def _read_token(f): - """ - Read in token from inside a markup tag. 
- """ - - token = "" - - c = f.read(1) - - while c != '' and c!= '>': - token += c - c = f.read(1) - - token = token.lower() - - return c, token - -def _read_to_next_token(f): - - out = '' - - c = f.read(1) - while c != '<' and c != '': - out += c - c = f.read(1) - - return c, out - -def _eat_all(f,d): - - """ - Discard all characters from input stream f of type d until we hit - a character that is not of type d. Return the most recent bit read - from the file. - """ - - got_char = False - - if _next_char(f) == d: got_char = True - - while _next_char(f) == d: f.read(1) - - if got_char: - return d - else: - return None - -def _throw_away_until(f,d): - """ - Discard all characters from input stream f until we hit a - character of type d. Discard this char also. Return the most - recent bit read from the file (which will either be d or EOF). - """ - - c = f.read(1) - while c != d and c != '': c = f.read(1) - - return c - -def _next_char(f): - """ - Return the next char in the file. - """ - - # Get the char: - c = f.read(1) - - # If it wasn't an EOF, backup one, otherwise stay put: - if c != '': f.seek(-1,1) - - return c - -def _last_char(g): - """ - Look at what the last character written to a file was. - """ - - pos = g.tell() - - if pos == 0: - # At the start of the file. - return None - else: - # Written at least one character, so step back one and read it - # off. - g.seek(-1,1) - return g.read(1) - -def _unicode_write(g, string, charset, force_conversion): - - strictness = { True : 'strict', - False: 'replace'}[force_conversion] - - # Could raise a UnicodeDecodingError! - unicode_str = unicode(string, charset, strictness) - - g.write(unicode_str) - -class RichTextConversionError(Exception): - - """ - An emtpy parent class for all errors in this module. - """ - - pass - -class ISO8859TagError(RichTextConversionError): - - """ - This error is raised when we are doing a conversion with - strict=True, the input string is unicode and we get an iso-8859-x - tag. Unicode should not contain mixed charsets. - """ - - pass - -# The original C code direct from RFC1341, appendix D -# See: http://www.faqs.org/rfcs/rfc1341.html - -# #include -# #include -# main() { -# int c, i; -# char token[50]; - -# while((c = getc(stdin)) != EOF) { -# if (c == '<') { -# for (i=0; (i<49 && (c = getc(stdin)) != '>' && c != EOF); ++i) { -# token[i] = isupper(c) ? tolower(c) : c; -# } -# if (c == EOF) break; -# if (c != '>') while ((c = getc(stdin)) != '>' && c != EOF) {;} -# if (c == EOF) break; -# token[i] = '\0'; -# if (!strcmp(token, "lt")) { -# putc('<', stdout); -# } else if (!strcmp(token, "nl")) { -# putc('\n', stdout); -# } else if (!strcmp(token, "/paragraph")) { -# fputs("\n\n", stdout); -# } else if (!strcmp(token, "comment")) { -# int commct=1; -# while (commct > 0) { -# while ((c = getc(stdin)) != '<' -# && c != EOF) ; -# if (c == EOF) break; -# for (i=0; (c = getc(stdin)) != '>' -# && c != EOF; ++i) { -# token[i] = isupper(c) ? 
-# tolower(c) : c; -# } -# if (c== EOF) break; -# token[i] = NULL; -# if (!strcmp(token, "/comment")) --commct; -# if (!strcmp(token, "comment")) ++commct; -# } -# } /* Ignore all other tokens */ -# } else if (c != '\n') putc(c, stdout); -# } -# putc('\n', stdout); /* for good measure */ -# } - -# data = open('sample.rtx','r') -# t = data.read() - - diff --git a/modules/elmsubmit/lib/elmsubmit_submission_parser.py b/modules/elmsubmit/lib/elmsubmit_submission_parser.py index 13091c379..ec3ecf32b 100644 --- a/modules/elmsubmit/lib/elmsubmit_submission_parser.py +++ b/modules/elmsubmit/lib/elmsubmit_submission_parser.py @@ -1,231 +1,226 @@ -# -*- coding: utf-8 -*- - -## $Id$ - +# -*- coding: utf-8 -*- +## +## $Id$ +## ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002, 2003, 2004, 2005 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - - """ parse_submission takes text like this: -------------- cdson::: _language_ eng _type_ test _title_ Blathering on About More Scientific Crap _author_ Owen, R _num_ 69 _date_ 01/01/2004 _keywords_ science forgery vitriol _abstract_ This is possibly the best document to come out of space in a long time. Aliens have really improved their scientific writing abilites. Brillig things about this document: One: I wrote it. Two: It smells of cheese. Three: Fnah! _note_ Not musical, but informative. _refnums_ AA11x-madeup_ref _files_ info.pdf foo.txt cdsoff::: Dear Sir, Here is the rest of the email. Sincerely, Jonathon Bloggs. -- Tel: 555 111 234567 IT-UDS CERN -------------- This is turned into a 2-tuple. The first entry in the tuple is a Python dictionary containing the submission info. The second entry is the trailing text that follows the submission (the submission MUST be at the top of the text). ({'abstract': 'This is possibly the best document to come out of space in a long\ntime. Aliens have really improved their scientific writing abilites.\n\nBrillig things about this document:\nOne: I wrote it.\nTwo: It smells of cheese.\nThree: Fnah!', 'author': 'Owen, R', 'date': '01/01/2004', 'files': 'info.pdf\nfoo.txt', 'keywords': 'science forgery vitriol', 'language': 'eng', 'note': 'Not musical, but informative.', 'num': '69', 'refnums': 'AA11x-madeup_ref', 'title': 'Blathering on About More Scientific Crap', 'type': 'test'}, '\nDear Sir,\n\nHere is the rest of the email.\n\nSincerely, \nJonathon Bloggs.\n\n--\nTel: 555 111 234567\nIT-UDS CERN') It is fairly robust when treating misformatted submissions, so should hopefully protect against misformatting due to evil smtp servers / conversion from HTML email producing clients, etc. 
For example, we can process the following OK: ---------------- cdson::: _language_ eng _type_ test _title_ Blathering on About More Scientific Crap __author__ Owen, R _num_ 69 _date_ 01/01/2004 _keywords_ science forgery vitriol cdsoff::: --------------------- """ import re tokens = ( 'CDSON', 'CDSOFF', 'KEY', 'VALUE', ) # Tokens def t_CDSON(t): r'\s*cdson:::\n+' return t def t_KEY(t): r'(?<=\n)[\ \t]*_+\w+?_+\s*\n+' t.value = re.search(r'_+(\w+?)_+', t.value).group(1) t.value = t.value.lower() return t def t_VALUE(t): r'.+?\S+.*?(?=([\ \t]*_+\w+?_+\s*\n|\n\s*cdsoff:::))' t.value = t.value.strip() return t def t_CDSOFF(t): r'(?s)\n\s*cdsoff:::(\n.*)?' match = re.search(r'(?s)\n\s*cdsoff:::(\n.*)?', t.value) global trailing_text if match.group(1) is not None: # [1:] kills the extra newline we matched: trailing_text = match.group(1)[1:] else: trailing_text = '' return t def t_error(t): print "Illegal character '%s'" % t.value[0] raise ValueError('bad parsing') # Build the lexer import lex lex.lex(optimize=1) # Parsing rules # Dictionary: data = {} def p_submission(p): """submission : CDSON assignmentList CDSOFF""" def p_assignmentList(p): """assignmentList : assignment | assignment assignmentList""" def p_assignment(p): """assignment : KEY VALUE""" data[p[1]] = p[2] def p_error(p): print "Syntax error at '%s'" % p.value raise ValueError('syntax error') import yacc yacc.yacc() def parse_submission(string): global data global trailing_text try: try: yacc.parse(string) return (data, trailing_text) except: raise SubmissionParserError() finally: data = {} trailing_text = '' class SubmissionParserError(Exception): pass - diff --git a/modules/elmsubmit/lib/elmsubmit_submission_parser.py.wml b/modules/elmsubmit/lib/elmsubmit_submission_parser.py.wml deleted file mode 100644 index 13091c379..000000000 --- a/modules/elmsubmit/lib/elmsubmit_submission_parser.py.wml +++ /dev/null @@ -1,231 +0,0 @@ -# -*- coding: utf-8 -*- - -## $Id$ - -## This file is part of the CERN Document Server Software (CDSware). -## Copyright (C) 2002, 2003, 2004, 2005 CERN. -## -## The CDSware is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License as -## published by the Free Software Foundation; either version 2 of the -## License, or (at your option) any later version. -## -## The CDSware is distributed in the hope that it will be useful, but -## WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -## General Public License for more details. -## -## You should have received a copy of the GNU General Public License -## along with CDSware; if not, write to the Free Software Foundation, Inc., -## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - - -""" -parse_submission takes text like this: - --------------- -cdson::: - -_language_ -eng - -_type_ -test - -_title_ -Blathering on About More Scientific Crap - -_author_ -Owen, R - -_num_ -69 - -_date_ -01/01/2004 - -_keywords_ -science forgery vitriol - -_abstract_ -This is possibly the best document to come out of space in a long -time. Aliens have really improved their scientific writing abilites. - -Brillig things about this document: -One: I wrote it. -Two: It smells of cheese. -Three: Fnah! - -_note_ -Not musical, but informative. - -_refnums_ -AA11x-madeup_ref - -_files_ -info.pdf -foo.txt - -cdsoff::: - -Dear Sir, - -Here is the rest of the email. 
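A minimal usage sketch for the parse_submission API shown in elmsubmit_submission_parser.py above; the field names and sample text are made up, and the expected result simply mirrors the behaviour documented in the module docstring (not part of the patch).

# Minimal usage sketch for parse_submission (sample text made up).
from elmsubmit_submission_parser import parse_submission  # assumed import path

text = """cdson:::

_title_
A made-up title

_author_
Doe, J

cdsoff:::
Trailing free text goes here."""

fields, trailing = parse_submission(text)
# fields   == {'title': 'A made-up title', 'author': 'Doe, J'}
# trailing == 'Trailing free text goes here.'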
- -Sincerely, -Jonathon Bloggs. - --- -Tel: 555 111 234567 -IT-UDS CERN --------------- - -This is turned into a 2-tuple. The first entry in the tuple is a -Python dictionary containing the submission info. The second entry is -the trailing text that follows the submission (the submission MUST be -at the top of the text). - -({'abstract': 'This is possibly the best document to come out of space in a long\ntime. Aliens have really improved their scientific writing abilites.\n\nBrillig things about this document:\nOne: I wrote it.\nTwo: It smells of cheese.\nThree: Fnah!', - 'author': 'Owen, R', - 'date': '01/01/2004', - 'files': 'info.pdf\nfoo.txt', - 'keywords': 'science forgery vitriol', - 'language': 'eng', - 'note': 'Not musical, but informative.', - 'num': '69', - 'refnums': 'AA11x-madeup_ref', - 'title': 'Blathering on About More Scientific Crap', - 'type': 'test'}, - '\nDear Sir,\n\nHere is the rest of the email.\n\nSincerely, \nJonathon Bloggs.\n\n--\nTel: 555 111 234567\nIT-UDS CERN') - -It is fairly robust when treating misformatted submissions, so should -hopefully protect against misformatting due to evil smtp servers / -conversion from HTML email producing clients, etc. For example, we can -process the following OK: - ----------------- - cdson::: - - _language_ - eng - - _type_ - - - - test - -_title_ - Blathering on About More Scientific Crap - -__author__ - Owen, R - - _num_ - 69 - - _date_ - - - - - - 01/01/2004 - - _keywords_ - science forgery vitriol - cdsoff::: - ---------------------- - -""" - - - -import re - -tokens = ( - 'CDSON', - 'CDSOFF', - 'KEY', - 'VALUE', - ) - -# Tokens -def t_CDSON(t): - r'\s*cdson:::\n+' - return t - -def t_KEY(t): - r'(?<=\n)[\ \t]*_+\w+?_+\s*\n+' - t.value = re.search(r'_+(\w+?)_+', t.value).group(1) - t.value = t.value.lower() - return t - -def t_VALUE(t): - r'.+?\S+.*?(?=([\ \t]*_+\w+?_+\s*\n|\n\s*cdsoff:::))' - t.value = t.value.strip() - return t - -def t_CDSOFF(t): - r'(?s)\n\s*cdsoff:::(\n.*)?' 
- match = re.search(r'(?s)\n\s*cdsoff:::(\n.*)?', t.value) - global trailing_text - if match.group(1) is not None: - # [1:] kills the extra newline we matched: - trailing_text = match.group(1)[1:] - else: - trailing_text = '' - return t - -def t_error(t): - print "Illegal character '%s'" % t.value[0] - raise ValueError('bad parsing') - -# Build the lexer -import lex -lex.lex(optimize=1) - -# Parsing rules - -# Dictionary: -data = {} - -def p_submission(p): - """submission : CDSON assignmentList CDSOFF""" - -def p_assignmentList(p): - """assignmentList : assignment - | assignment assignmentList""" - -def p_assignment(p): - """assignment : KEY VALUE""" - data[p[1]] = p[2] - -def p_error(p): - print "Syntax error at '%s'" % p.value - raise ValueError('syntax error') - -import yacc -yacc.yacc() - -def parse_submission(string): - global data - global trailing_text - try: - try: - yacc.parse(string) - return (data, trailing_text) - except: - raise SubmissionParserError() - finally: - data = {} - trailing_text = '' - -class SubmissionParserError(Exception): - pass - - diff --git a/modules/elmsubmit/lib/encdet.py b/modules/elmsubmit/lib/encdet.py index 7c8f71af9..6bae49260 100644 --- a/modules/elmsubmit/lib/encdet.py +++ b/modules/elmsubmit/lib/encdet.py @@ -1,342 +1,342 @@ -#!/usr/bin/env python +#!/usr/bin/env python # -*- encoding: japanese.ms932 -*- # encdet.py - An encoding detector # by Yusuke Shinyama # * public domain * import sys, re ## EncodingRecognizer ## - a finite automaton which receives octets ## class EncodingRecognizer: SCORE_DEFAULT = 0.5 DEATH_PENALTY = -100 GIVEUP_THRESHOLD = -1000 # character sets: must be exclusive! CHARSET = [ # zenkaku-kana (1.5, re.compile(u"[‚Ÿ-‚ñ]"), 0x01), (1.5, re.compile(u"[ƒ@-ƒ”]"), 0x02), (1.0, re.compile(u"[[RSTU]"), 0x03), # hankaku latin (1.2, re.compile(u"[a-zA-Z0-9]"), 0x04), (0.0, re.compile(u"[\u00c0-\u00ff]"), 0x04), # hankaku-kana (0.8, re.compile(u"[\uff66-\uff9d]"), 0x08), # zenkaku-alphanum (1.2, re.compile(u"[‚`-‚y‚-‚š‚O-‚X]"), 0x10), # kanji (1.0, re.compile(u"[\u4e00-\u9fff]"), 0x20), ] def __init__(self, encoding): self.encoding = encoding self.ch = "" self.state = 1 self.partial_score = 0.0 self.total_score = 0.0 self.chunk_type = 0 return def __repr__(self): return "" % \ (self.encoding, self.state, self.chunk_type, self.partial_score, self.total_score) def die(self): #print "died:", self self.total_score += self.DEATH_PENALTY if self.total_score <= self.GIVEUP_THRESHOLD: # game is over... #print "giveup:", self self.state = 0 else: # try again... self.state = 1 self.partial_score = 0 self.ch = "" return def flush(self): self.total_score += self.partial_score * self.partial_score self.partial_score = 0.0 return def accept(self, s): try: c = unicode(s, self.encoding) except UnicodeError: c = "" for (score, pat, flags) in self.CHARSET: if pat.match(c): if self.chunk_type == 0 or not (self.chunk_type & flags): self.flush() self.chunk_type = flags self.partial_score += score break else: self.flush() self.chunk_type = 0 self.partial_score += self.SCORE_DEFAULT return def finish(self): self.flush() if 1 < self.state: self.die() return ## CHARACTER SETS ## ISO-8859-* ## class ISO8859_Recognizer(EncodingRecognizer): def __init__(self): return EncodingRecognizer.__init__(self, "iso8859_1") def feed(self, c): if self.state == 0: # already dead? return elif self.state == 1: # ascii or iso? 
if c < 0x7f or (0xa0 <= c and c <= 0xff): self.state = 1 self.accept(chr(c)) else: self.die() return ## EUC-JP ## class EUCJP_Recognizer(EncodingRecognizer): def __init__(self): self.hankaku = False return EncodingRecognizer.__init__(self, "japanese.euc_jp") def feed(self, c): if self.state == 0: # already dead? return # 1stbyte elif self.state == 1: if c < 0x7f: # ascii? # succeed self.state = 1 self.accept(chr(c)) self.ch = "" # IGNORE EUC-JP hankaku chars, no one is using # elif 0x8e == c: # hankaku-kana 1stbyte? # # next # self.state = 2 # self.ch = chr(c) # self.hankaku = True elif 0xa1 <= c and c <= 0xfe: # kanji 1stbyte? # next self.state = 2 self.ch = chr(c) self.hankaku = False else: self.die() # 2ndbyte elif self.state == 2: if self.hankaku and (0xa1 <= c and c <= 0xdf): # hankaku-kana 2ndbyte? # succeed self.ch += chr(c) self.accept(self.ch) self.state = 1 self.ch = "" elif not self.hankaku and (0xa1 <= c and c <= 0xfe): # kanji 2ndbyte? # succeed self.ch += chr(c) self.accept(self.ch) self.state = 1 self.ch = "" else: self.die() return ## CP932 ## class CP932_Recognizer(EncodingRecognizer): def __init__(self): return EncodingRecognizer.__init__(self, "japanese.ms932") def feed(self, c): if self.state == 0: # already dead? return # 1stbyte elif self.state == 1: if c < 0x7f: # ascii? # succeed self.state = 1 self.accept(chr(c)) self.ch = "" elif 0xa1 <= c and c <= 0xdf: # hankaku-kana? # succeed self.state = 1 self.accept(chr(c)) self.ch = "" elif (0x81 <= c and c <= 0x9f) or (0xe0 <= c and c <= 0xee) \ or (0xfa <= c and c <= 0xfc): # kanji 1stbyte? # next self.state = 2 self.ch = chr(c) else: self.die() # 2ndbyte elif self.state == 2: if 0x40 <= c and c <= 0xfc and c != 0x7f: # kanji 2ndbyte? # succeed self.accept(self.ch+chr(c)) self.state = 1 self.ch = "" else: self.die() return ## UTF-8 ## class UTF8_Recognizer(EncodingRecognizer): def __init__(self): self.left = 0 return EncodingRecognizer.__init__(self, "utf8") def feed(self, c): if self.state == 0: # already dead? return # 1stbyte elif self.state == 1: if c <= 0x7f: # 00xxxxxx: 1byte only? # succeed self.state = 1 self.accept(chr(c)) self.ch = "" elif c & 0xe0 == 0xc0: # 110xxxxx: 2bytes # next self.state = 2 self.left = 1 self.ch = chr(c) elif c & 0xf0 == 0xe0: # 1110xxxx: 3bytes # next self.state = 2 self.left = 2 self.ch = chr(c) elif c & 0xf8 == 0xf0: # 11110xxx: 4bytes # next self.state = 2 self.left = 3 self.ch = chr(c) elif c & 0xfc == 0xf8: # 111110xx: 5bytes # next self.state = 2 self.left = 4 self.ch = chr(c) else: self.die() # n-th byte (where 2<=n) else: if c & 0xc0 == 0x80: # 10xxxxxx: continuous? self.state += 1 self.left -= 1 self.ch += chr(c) if self.left == 0: # finished? 
# succeed self.state = 1 self.accept(self.ch) self.ch = "" else: # next pass else: self.die() return # guess def guess(s): recognizer = [ EUCJP_Recognizer(), CP932_Recognizer(), ISO8859_Recognizer(), UTF8_Recognizer() ] for c in s: for r in recognizer: r.feed(ord(c)) for r in recognizer: r.finish() #print r recognizer.sort(lambda a,b: cmp(b.total_score, a.total_score)) return recognizer[0].encoding # test suite def test(s0, test_encodings): false_encodings = [ "japanese.euc_jp", "japanese.ms932", "utf8", "iso8859_1" ] for enc1 in test_encodings: try: s = s0.encode(enc1) except UnicodeError: continue print "try '%s' in %s (%s)" % (s0, enc1, " ".join(map(lambda c:"%02x" % ord(c), s))) for enc2 in false_encodings: if enc1 != enc2: try: x = str(unicode(s, enc2)) print " (could be: '%s' in %s)" % (x, enc2) except UnicodeError: continue genc = guess(s) if genc == enc1: print " CORRECT:", genc else: print " ! INCORRECT:", genc print return def test_suite(): # kana only test(u"‚±‚ñ‚É‚¿‚Í", ["japanese.euc_jp", "japanese.ms932", "utf8"]) # kana + alphanum test(u"A‚ÍB‚ÆC‚Å‚ ‚é", ["japanese.euc_jp", "japanese.ms932", "utf8"]) # kana + kanji test(u"–ˆ’©V•·ƒjƒ…[ƒX", ["japanese.euc_jp", "japanese.ms932", "utf8"]) # kanji + hankakukana test(u"–³‘èÄÞ·­ÒÝÄ", ["japanese.ms932", "utf8"]) # iso8859-1 test(u"Enzyklop\u00e4die", ["utf8", "iso8859_1"]) return # main test_suite(); sys.exit(0) if __name__ == "__main__": import fileinput for s in fileinput.input(): - print guess(s) + print guess(s) diff --git a/modules/elmsubmit/lib/encdet.py.wml b/modules/elmsubmit/lib/encdet.py.wml deleted file mode 100644 index 7c8f71af9..000000000 --- a/modules/elmsubmit/lib/encdet.py.wml +++ /dev/null @@ -1,342 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: japanese.ms932 -*- - -# encdet.py - An encoding detector -# by Yusuke Shinyama -# * public domain * - -import sys, re - - -## EncodingRecognizer -## - a finite automaton which receives octets -## -class EncodingRecognizer: - - SCORE_DEFAULT = 0.5 - DEATH_PENALTY = -100 - GIVEUP_THRESHOLD = -1000 - - # character sets: must be exclusive! - CHARSET = [ - # zenkaku-kana - (1.5, re.compile(u"[‚Ÿ-‚ñ]"), 0x01), - (1.5, re.compile(u"[ƒ@-ƒ”]"), 0x02), - (1.0, re.compile(u"[[RSTU]"), 0x03), - - # hankaku latin - (1.2, re.compile(u"[a-zA-Z0-9]"), 0x04), - (0.0, re.compile(u"[\u00c0-\u00ff]"), 0x04), - - # hankaku-kana - (0.8, re.compile(u"[\uff66-\uff9d]"), 0x08), - - # zenkaku-alphanum - (1.2, re.compile(u"[‚`-‚y‚-‚š‚O-‚X]"), 0x10), - - # kanji - (1.0, re.compile(u"[\u4e00-\u9fff]"), 0x20), - - ] - - def __init__(self, encoding): - self.encoding = encoding - self.ch = "" - self.state = 1 - self.partial_score = 0.0 - self.total_score = 0.0 - self.chunk_type = 0 - return - - def __repr__(self): - return "" % \ - (self.encoding, self.state, self.chunk_type, self.partial_score, self.total_score) - - def die(self): - #print "died:", self - self.total_score += self.DEATH_PENALTY - if self.total_score <= self.GIVEUP_THRESHOLD: - # game is over... - #print "giveup:", self - self.state = 0 - else: - # try again... 
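For orientation, a minimal usage sketch for the guess() entry point in encdet.py above. Two caveats, both assumptions flagged here rather than facts from the patch: as committed the module runs test_suite(); sys.exit(0) at import time, so the sketch assumes that call is guarded, and the japanese.* codecs the recognizers refer to must be installed for them to decode. The sample word matches the module's own ISO-8859-1 test case.

# Minimal usage sketch for encdet.guess() (assumes the module-level
# "test_suite(); sys.exit(0)" call has been guarded so the import returns,
# and that the external japanese.* codecs used by the recognizers are installed).
from encdet import guess  # assumed import path

latin1_bytes = 'Enzyklop\xe4die'    # "Enzyklopädie" encoded as ISO-8859-1
print guess(latin1_bytes)           # expected: 'iso8859_1'

utf8_bytes = 'Enzyklop\xc3\xa4die'  # the same word encoded as UTF-8
print guess(utf8_bytes)             # expected: 'utf8'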
- self.state = 1 - self.partial_score = 0 - self.ch = "" - return - - def flush(self): - self.total_score += self.partial_score * self.partial_score - self.partial_score = 0.0 - return - - def accept(self, s): - try: - c = unicode(s, self.encoding) - except UnicodeError: - c = "" - for (score, pat, flags) in self.CHARSET: - if pat.match(c): - if self.chunk_type == 0 or not (self.chunk_type & flags): - self.flush() - self.chunk_type = flags - self.partial_score += score - break - else: - self.flush() - self.chunk_type = 0 - self.partial_score += self.SCORE_DEFAULT - return - - def finish(self): - self.flush() - if 1 < self.state: - self.die() - return - - -## CHARACTER SETS - - -## ISO-8859-* -## -class ISO8859_Recognizer(EncodingRecognizer): - - def __init__(self): - return EncodingRecognizer.__init__(self, "iso8859_1") - - def feed(self, c): - if self.state == 0: # already dead? - return - - elif self.state == 1: # ascii or iso? - if c < 0x7f or (0xa0 <= c and c <= 0xff): - self.state = 1 - self.accept(chr(c)) - - else: - self.die() - - return - - -## EUC-JP -## -class EUCJP_Recognizer(EncodingRecognizer): - - def __init__(self): - self.hankaku = False - return EncodingRecognizer.__init__(self, "japanese.euc_jp") - - def feed(self, c): - if self.state == 0: # already dead? - return - - # 1stbyte - elif self.state == 1: - if c < 0x7f: # ascii? - # succeed - self.state = 1 - self.accept(chr(c)) - self.ch = "" -# IGNORE EUC-JP hankaku chars, no one is using -# elif 0x8e == c: # hankaku-kana 1stbyte? -# # next -# self.state = 2 -# self.ch = chr(c) -# self.hankaku = True - elif 0xa1 <= c and c <= 0xfe: # kanji 1stbyte? - # next - self.state = 2 - self.ch = chr(c) - self.hankaku = False - else: - self.die() - - # 2ndbyte - elif self.state == 2: - if self.hankaku and (0xa1 <= c and c <= 0xdf): # hankaku-kana 2ndbyte? - # succeed - self.ch += chr(c) - self.accept(self.ch) - self.state = 1 - self.ch = "" - elif not self.hankaku and (0xa1 <= c and c <= 0xfe): # kanji 2ndbyte? - # succeed - self.ch += chr(c) - self.accept(self.ch) - self.state = 1 - self.ch = "" - else: - self.die() - - return - - -## CP932 -## -class CP932_Recognizer(EncodingRecognizer): - - def __init__(self): - return EncodingRecognizer.__init__(self, "japanese.ms932") - - def feed(self, c): - if self.state == 0: # already dead? - return - - # 1stbyte - elif self.state == 1: - if c < 0x7f: # ascii? - # succeed - self.state = 1 - self.accept(chr(c)) - self.ch = "" - elif 0xa1 <= c and c <= 0xdf: # hankaku-kana? - # succeed - self.state = 1 - self.accept(chr(c)) - self.ch = "" - elif (0x81 <= c and c <= 0x9f) or (0xe0 <= c and c <= 0xee) \ - or (0xfa <= c and c <= 0xfc): # kanji 1stbyte? - # next - self.state = 2 - self.ch = chr(c) - else: - self.die() - - # 2ndbyte - elif self.state == 2: - if 0x40 <= c and c <= 0xfc and c != 0x7f: # kanji 2ndbyte? - # succeed - self.accept(self.ch+chr(c)) - self.state = 1 - self.ch = "" - else: - self.die() - - return - - -## UTF-8 -## -class UTF8_Recognizer(EncodingRecognizer): - - def __init__(self): - self.left = 0 - return EncodingRecognizer.__init__(self, "utf8") - - def feed(self, c): - if self.state == 0: # already dead? - return - - # 1stbyte - elif self.state == 1: - if c <= 0x7f: # 00xxxxxx: 1byte only? 
- # succeed - self.state = 1 - self.accept(chr(c)) - self.ch = "" - elif c & 0xe0 == 0xc0: # 110xxxxx: 2bytes - # next - self.state = 2 - self.left = 1 - self.ch = chr(c) - elif c & 0xf0 == 0xe0: # 1110xxxx: 3bytes - # next - self.state = 2 - self.left = 2 - self.ch = chr(c) - elif c & 0xf8 == 0xf0: # 11110xxx: 4bytes - # next - self.state = 2 - self.left = 3 - self.ch = chr(c) - elif c & 0xfc == 0xf8: # 111110xx: 5bytes - # next - self.state = 2 - self.left = 4 - self.ch = chr(c) - else: - self.die() - - # n-th byte (where 2<=n) - else: - if c & 0xc0 == 0x80: # 10xxxxxx: continuous? - self.state += 1 - self.left -= 1 - self.ch += chr(c) - if self.left == 0: # finished? - # succeed - self.state = 1 - self.accept(self.ch) - self.ch = "" - else: - # next - pass - else: - self.die() - - return - - -# guess -def guess(s): - recognizer = [ - EUCJP_Recognizer(), - CP932_Recognizer(), - ISO8859_Recognizer(), - UTF8_Recognizer() - ] - for c in s: - for r in recognizer: - r.feed(ord(c)) - for r in recognizer: - r.finish() - #print r - recognizer.sort(lambda a,b: cmp(b.total_score, a.total_score)) - return recognizer[0].encoding - -# test suite -def test(s0, test_encodings): - false_encodings = [ "japanese.euc_jp", "japanese.ms932", "utf8", "iso8859_1" ] - for enc1 in test_encodings: - try: - s = s0.encode(enc1) - except UnicodeError: - continue - print "try '%s' in %s (%s)" % (s0, enc1, " ".join(map(lambda c:"%02x" % ord(c), s))) - for enc2 in false_encodings: - if enc1 != enc2: - try: - x = str(unicode(s, enc2)) - print " (could be: '%s' in %s)" % (x, enc2) - except UnicodeError: - continue - genc = guess(s) - if genc == enc1: - print " CORRECT:", genc - else: - print " ! INCORRECT:", genc - print - return - -def test_suite(): - # kana only - test(u"‚±‚ñ‚É‚¿‚Í", ["japanese.euc_jp", "japanese.ms932", "utf8"]) - # kana + alphanum - test(u"A‚ÍB‚ÆC‚Å‚ ‚é", ["japanese.euc_jp", "japanese.ms932", "utf8"]) - # kana + kanji - test(u"–ˆ’©V•·ƒjƒ…[ƒX", ["japanese.euc_jp", "japanese.ms932", "utf8"]) - # kanji + hankakukana - test(u"–³‘èÄÞ·­ÒÝÄ", ["japanese.ms932", "utf8"]) - # iso8859-1 - test(u"Enzyklop\u00e4die", ["utf8", "iso8859_1"]) - return - -# main -test_suite(); sys.exit(0) -if __name__ == "__main__": - import fileinput - for s in fileinput.input(): - print guess(s) diff --git a/modules/elmsubmit/lib/encdet_utf8.py b/modules/elmsubmit/lib/encdet_utf8.py index e70810cb3..d2cab062e 100644 --- a/modules/elmsubmit/lib/encdet_utf8.py +++ b/modules/elmsubmit/lib/encdet_utf8.py @@ -1,344 +1,344 @@ -#!/usr/bin/env python +#!/usr/bin/env python # -*- encoding: utf8 -*- # Converted from the original japanese.ms932 encoding to utf8. # encdet.py - An encoding detector # by Yusuke Shinyama # * public domain * import sys, re ## EncodingRecognizer ## - a finite automaton which receives octets ## class EncodingRecognizer: SCORE_DEFAULT = 0.5 DEATH_PENALTY = -100 GIVEUP_THRESHOLD = -1000 # character sets: must be exclusive! 
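    # Each CHARSET entry below is (score, compiled pattern, type flag): the
    # flag identifies which kind of run (kana, latin, kanji, ...) a decoded
    # character belongs to.  accept() keeps adding the entry's score to
    # partial_score while consecutive characters stay in the same character
    # class; flush() then adds partial_score squared to total_score, so long
    # homogeneous runs are rewarded far more than isolated matches.  die()
    # charges DEATH_PENALTY for every impossible byte sequence and disables
    # the recognizer once total_score falls to GIVEUP_THRESHOLD.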
CHARSET = [ # zenkaku-kana (1.5, re.compile(u"[ã-ã‚“]"), 0x01), (1.5, re.compile(u"[ã‚¡-ヴ]"), 0x02), (1.0, re.compile(u"[ーヽヾã‚ã‚ž]"), 0x03), # hankaku latin (1.2, re.compile(u"[a-zA-Z0-9]"), 0x04), (0.0, re.compile(u"[\u00c0-\u00ff]"), 0x04), # hankaku-kana (0.8, re.compile(u"[\uff66-\uff9d]"), 0x08), # zenkaku-alphanum (1.2, re.compile(u"[A-Zï½-zï¼-ï¼™]"), 0x10), # kanji (1.0, re.compile(u"[\u4e00-\u9fff]"), 0x20), ] def __init__(self, encoding): self.encoding = encoding self.ch = "" self.state = 1 self.partial_score = 0.0 self.total_score = 0.0 self.chunk_type = 0 return def __repr__(self): return "" % \ (self.encoding, self.state, self.chunk_type, self.partial_score, self.total_score) def die(self): #print "died:", self self.total_score += self.DEATH_PENALTY if self.total_score <= self.GIVEUP_THRESHOLD: # game is over... #print "giveup:", self self.state = 0 else: # try again... self.state = 1 self.partial_score = 0 self.ch = "" return def flush(self): self.total_score += self.partial_score * self.partial_score self.partial_score = 0.0 return def accept(self, s): try: c = unicode(s, self.encoding) except UnicodeError: c = "" for (score, pat, flags) in self.CHARSET: if pat.match(c): if self.chunk_type == 0 or not (self.chunk_type & flags): self.flush() self.chunk_type = flags self.partial_score += score break else: self.flush() self.chunk_type = 0 self.partial_score += self.SCORE_DEFAULT return def finish(self): self.flush() if 1 < self.state: self.die() return ## CHARACTER SETS ## ISO-8859-* ## class ISO8859_Recognizer(EncodingRecognizer): def __init__(self): return EncodingRecognizer.__init__(self, "iso8859_1") def feed(self, c): if self.state == 0: # already dead? return elif self.state == 1: # ascii or iso? if c < 0x7f or (0xa0 <= c and c <= 0xff): self.state = 1 self.accept(chr(c)) else: self.die() return ## EUC-JP ## class EUCJP_Recognizer(EncodingRecognizer): def __init__(self): self.hankaku = False return EncodingRecognizer.__init__(self, "japanese.euc_jp") def feed(self, c): if self.state == 0: # already dead? return # 1stbyte elif self.state == 1: if c < 0x7f: # ascii? # succeed self.state = 1 self.accept(chr(c)) self.ch = "" # IGNORE EUC-JP hankaku chars, no one is using # elif 0x8e == c: # hankaku-kana 1stbyte? # # next # self.state = 2 # self.ch = chr(c) # self.hankaku = True elif 0xa1 <= c and c <= 0xfe: # kanji 1stbyte? # next self.state = 2 self.ch = chr(c) self.hankaku = False else: self.die() # 2ndbyte elif self.state == 2: if self.hankaku and (0xa1 <= c and c <= 0xdf): # hankaku-kana 2ndbyte? # succeed self.ch += chr(c) self.accept(self.ch) self.state = 1 self.ch = "" elif not self.hankaku and (0xa1 <= c and c <= 0xfe): # kanji 2ndbyte? # succeed self.ch += chr(c) self.accept(self.ch) self.state = 1 self.ch = "" else: self.die() return ## CP932 ## class CP932_Recognizer(EncodingRecognizer): def __init__(self): return EncodingRecognizer.__init__(self, "japanese.ms932") def feed(self, c): if self.state == 0: # already dead? return # 1stbyte elif self.state == 1: if c < 0x7f: # ascii? # succeed self.state = 1 self.accept(chr(c)) self.ch = "" elif 0xa1 <= c and c <= 0xdf: # hankaku-kana? # succeed self.state = 1 self.accept(chr(c)) self.ch = "" elif (0x81 <= c and c <= 0x9f) or (0xe0 <= c and c <= 0xee) \ or (0xfa <= c and c <= 0xfc): # kanji 1stbyte? # next self.state = 2 self.ch = chr(c) else: self.die() # 2ndbyte elif self.state == 2: if 0x40 <= c and c <= 0xfc and c != 0x7f: # kanji 2ndbyte? 
# succeed self.accept(self.ch+chr(c)) self.state = 1 self.ch = "" else: self.die() return ## UTF-8 ## class UTF8_Recognizer(EncodingRecognizer): def __init__(self): self.left = 0 return EncodingRecognizer.__init__(self, "utf8") def feed(self, c): if self.state == 0: # already dead? return # 1stbyte elif self.state == 1: if c <= 0x7f: # 00xxxxxx: 1byte only? # succeed self.state = 1 self.accept(chr(c)) self.ch = "" elif c & 0xe0 == 0xc0: # 110xxxxx: 2bytes # next self.state = 2 self.left = 1 self.ch = chr(c) elif c & 0xf0 == 0xe0: # 1110xxxx: 3bytes # next self.state = 2 self.left = 2 self.ch = chr(c) elif c & 0xf8 == 0xf0: # 11110xxx: 4bytes # next self.state = 2 self.left = 3 self.ch = chr(c) elif c & 0xfc == 0xf8: # 111110xx: 5bytes # next self.state = 2 self.left = 4 self.ch = chr(c) else: self.die() # n-th byte (where 2<=n) else: if c & 0xc0 == 0x80: # 10xxxxxx: continuous? self.state += 1 self.left -= 1 self.ch += chr(c) if self.left == 0: # finished? # succeed self.state = 1 self.accept(self.ch) self.ch = "" else: # next pass else: self.die() return # guess def guess(s): recognizer = [ EUCJP_Recognizer(), CP932_Recognizer(), ISO8859_Recognizer(), UTF8_Recognizer() ] for c in s: for r in recognizer: r.feed(ord(c)) for r in recognizer: r.finish() #print r recognizer.sort(lambda a,b: cmp(b.total_score, a.total_score)) return recognizer[0].encoding # test suite def test(s0, test_encodings): false_encodings = [ "japanese.euc_jp", "japanese.ms932", "utf8", "iso8859_1" ] for enc1 in test_encodings: try: s = s0.encode(enc1) except UnicodeError: continue print "try '%s' in %s (%s)" % (s0.encode('utf8'), enc1.encode('utf8'), " ".join(map(lambda c:"%02x" % ord(c), s))) for enc2 in false_encodings: if enc1 != enc2: try: x = str(unicode(s, enc2)) print " (could be: '%s' in %s)" % (x, enc2) except UnicodeError: continue genc = guess(s) if genc == enc1: print " CORRECT:", genc else: print " ! INCORRECT:", genc print return def test_suite(): # kana only test(u"ã“ã‚“ã«ã¡ã¯", ["japanese.euc_jp", "japanese.ms932", "utf8"]) # kana + alphanum test(u"Aã¯Bã¨Cã§ã‚ã‚‹", ["japanese.euc_jp", "japanese.ms932", "utf8"]) # kana + kanji test(u"毎æœæ–°èžãƒ‹ãƒ¥ãƒ¼ã‚¹", ["japanese.euc_jp", "japanese.ms932", "utf8"]) # kanji + hankakukana test(u"無題ドキュメï¾ï¾„", ["japanese.ms932", "utf8"]) # iso8859-1 test(u"Enzyklop\u00e4die", ["utf8", "iso8859_1"]) return # main test_suite(); sys.exit(0) if __name__ == "__main__": import fileinput for s in fileinput.input(): - print guess(s) + print guess(s) diff --git a/modules/elmsubmit/lib/encdet_utf8.py.wml b/modules/elmsubmit/lib/encdet_utf8.py.wml deleted file mode 100644 index e70810cb3..000000000 --- a/modules/elmsubmit/lib/encdet_utf8.py.wml +++ /dev/null @@ -1,344 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf8 -*- - -# Converted from the original japanese.ms932 encoding to utf8. - -# encdet.py - An encoding detector -# by Yusuke Shinyama -# * public domain * - -import sys, re - - -## EncodingRecognizer -## - a finite automaton which receives octets -## -class EncodingRecognizer: - - SCORE_DEFAULT = 0.5 - DEATH_PENALTY = -100 - GIVEUP_THRESHOLD = -1000 - - # character sets: must be exclusive! 
- CHARSET = [ - # zenkaku-kana - (1.5, re.compile(u"[ã-ã‚“]"), 0x01), - (1.5, re.compile(u"[ã‚¡-ヴ]"), 0x02), - (1.0, re.compile(u"[ーヽヾã‚ã‚ž]"), 0x03), - - # hankaku latin - (1.2, re.compile(u"[a-zA-Z0-9]"), 0x04), - (0.0, re.compile(u"[\u00c0-\u00ff]"), 0x04), - - # hankaku-kana - (0.8, re.compile(u"[\uff66-\uff9d]"), 0x08), - - # zenkaku-alphanum - (1.2, re.compile(u"[A-Zï½-zï¼-ï¼™]"), 0x10), - - # kanji - (1.0, re.compile(u"[\u4e00-\u9fff]"), 0x20), - - ] - - def __init__(self, encoding): - self.encoding = encoding - self.ch = "" - self.state = 1 - self.partial_score = 0.0 - self.total_score = 0.0 - self.chunk_type = 0 - return - - def __repr__(self): - return "" % \ - (self.encoding, self.state, self.chunk_type, self.partial_score, self.total_score) - - def die(self): - #print "died:", self - self.total_score += self.DEATH_PENALTY - if self.total_score <= self.GIVEUP_THRESHOLD: - # game is over... - #print "giveup:", self - self.state = 0 - else: - # try again... - self.state = 1 - self.partial_score = 0 - self.ch = "" - return - - def flush(self): - self.total_score += self.partial_score * self.partial_score - self.partial_score = 0.0 - return - - def accept(self, s): - try: - c = unicode(s, self.encoding) - except UnicodeError: - c = "" - for (score, pat, flags) in self.CHARSET: - if pat.match(c): - if self.chunk_type == 0 or not (self.chunk_type & flags): - self.flush() - self.chunk_type = flags - self.partial_score += score - break - else: - self.flush() - self.chunk_type = 0 - self.partial_score += self.SCORE_DEFAULT - return - - def finish(self): - self.flush() - if 1 < self.state: - self.die() - return - - -## CHARACTER SETS - - -## ISO-8859-* -## -class ISO8859_Recognizer(EncodingRecognizer): - - def __init__(self): - return EncodingRecognizer.__init__(self, "iso8859_1") - - def feed(self, c): - if self.state == 0: # already dead? - return - - elif self.state == 1: # ascii or iso? - if c < 0x7f or (0xa0 <= c and c <= 0xff): - self.state = 1 - self.accept(chr(c)) - - else: - self.die() - - return - - -## EUC-JP -## -class EUCJP_Recognizer(EncodingRecognizer): - - def __init__(self): - self.hankaku = False - return EncodingRecognizer.__init__(self, "japanese.euc_jp") - - def feed(self, c): - if self.state == 0: # already dead? - return - - # 1stbyte - elif self.state == 1: - if c < 0x7f: # ascii? - # succeed - self.state = 1 - self.accept(chr(c)) - self.ch = "" -# IGNORE EUC-JP hankaku chars, no one is using -# elif 0x8e == c: # hankaku-kana 1stbyte? -# # next -# self.state = 2 -# self.ch = chr(c) -# self.hankaku = True - elif 0xa1 <= c and c <= 0xfe: # kanji 1stbyte? - # next - self.state = 2 - self.ch = chr(c) - self.hankaku = False - else: - self.die() - - # 2ndbyte - elif self.state == 2: - if self.hankaku and (0xa1 <= c and c <= 0xdf): # hankaku-kana 2ndbyte? - # succeed - self.ch += chr(c) - self.accept(self.ch) - self.state = 1 - self.ch = "" - elif not self.hankaku and (0xa1 <= c and c <= 0xfe): # kanji 2ndbyte? - # succeed - self.ch += chr(c) - self.accept(self.ch) - self.state = 1 - self.ch = "" - else: - self.die() - - return - - -## CP932 -## -class CP932_Recognizer(EncodingRecognizer): - - def __init__(self): - return EncodingRecognizer.__init__(self, "japanese.ms932") - - def feed(self, c): - if self.state == 0: # already dead? - return - - # 1stbyte - elif self.state == 1: - if c < 0x7f: # ascii? - # succeed - self.state = 1 - self.accept(chr(c)) - self.ch = "" - elif 0xa1 <= c and c <= 0xdf: # hankaku-kana? 
- # succeed - self.state = 1 - self.accept(chr(c)) - self.ch = "" - elif (0x81 <= c and c <= 0x9f) or (0xe0 <= c and c <= 0xee) \ - or (0xfa <= c and c <= 0xfc): # kanji 1stbyte? - # next - self.state = 2 - self.ch = chr(c) - else: - self.die() - - # 2ndbyte - elif self.state == 2: - if 0x40 <= c and c <= 0xfc and c != 0x7f: # kanji 2ndbyte? - # succeed - self.accept(self.ch+chr(c)) - self.state = 1 - self.ch = "" - else: - self.die() - - return - - -## UTF-8 -## -class UTF8_Recognizer(EncodingRecognizer): - - def __init__(self): - self.left = 0 - return EncodingRecognizer.__init__(self, "utf8") - - def feed(self, c): - if self.state == 0: # already dead? - return - - # 1stbyte - elif self.state == 1: - if c <= 0x7f: # 00xxxxxx: 1byte only? - # succeed - self.state = 1 - self.accept(chr(c)) - self.ch = "" - elif c & 0xe0 == 0xc0: # 110xxxxx: 2bytes - # next - self.state = 2 - self.left = 1 - self.ch = chr(c) - elif c & 0xf0 == 0xe0: # 1110xxxx: 3bytes - # next - self.state = 2 - self.left = 2 - self.ch = chr(c) - elif c & 0xf8 == 0xf0: # 11110xxx: 4bytes - # next - self.state = 2 - self.left = 3 - self.ch = chr(c) - elif c & 0xfc == 0xf8: # 111110xx: 5bytes - # next - self.state = 2 - self.left = 4 - self.ch = chr(c) - else: - self.die() - - # n-th byte (where 2<=n) - else: - if c & 0xc0 == 0x80: # 10xxxxxx: continuous? - self.state += 1 - self.left -= 1 - self.ch += chr(c) - if self.left == 0: # finished? - # succeed - self.state = 1 - self.accept(self.ch) - self.ch = "" - else: - # next - pass - else: - self.die() - - return - - -# guess -def guess(s): - recognizer = [ - EUCJP_Recognizer(), - CP932_Recognizer(), - ISO8859_Recognizer(), - UTF8_Recognizer() - ] - for c in s: - for r in recognizer: - r.feed(ord(c)) - for r in recognizer: - r.finish() - #print r - recognizer.sort(lambda a,b: cmp(b.total_score, a.total_score)) - return recognizer[0].encoding - -# test suite -def test(s0, test_encodings): - false_encodings = [ "japanese.euc_jp", "japanese.ms932", "utf8", "iso8859_1" ] - for enc1 in test_encodings: - try: - s = s0.encode(enc1) - except UnicodeError: - continue - print "try '%s' in %s (%s)" % (s0.encode('utf8'), enc1.encode('utf8'), " ".join(map(lambda c:"%02x" % ord(c), s))) - for enc2 in false_encodings: - if enc1 != enc2: - try: - x = str(unicode(s, enc2)) - print " (could be: '%s' in %s)" % (x, enc2) - except UnicodeError: - continue - genc = guess(s) - if genc == enc1: - print " CORRECT:", genc - else: - print " ! INCORRECT:", genc - print - return - -def test_suite(): - # kana only - test(u"ã“ã‚“ã«ã¡ã¯", ["japanese.euc_jp", "japanese.ms932", "utf8"]) - # kana + alphanum - test(u"Aã¯Bã¨Cã§ã‚ã‚‹", ["japanese.euc_jp", "japanese.ms932", "utf8"]) - # kana + kanji - test(u"毎æœæ–°èžãƒ‹ãƒ¥ãƒ¼ã‚¹", ["japanese.euc_jp", "japanese.ms932", "utf8"]) - # kanji + hankakukana - test(u"無題ドキュメï¾ï¾„", ["japanese.ms932", "utf8"]) - # iso8859-1 - test(u"Enzyklop\u00e4die", ["utf8", "iso8859_1"]) - return - -# main -test_suite(); sys.exit(0) -if __name__ == "__main__": - import fileinput - for s in fileinput.input(): - print guess(s) diff --git a/modules/elmsubmit/lib/lex.py b/modules/elmsubmit/lib/lex.py index 02248cbce..4a5f58d71 100644 --- a/modules/elmsubmit/lib/lex.py +++ b/modules/elmsubmit/lib/lex.py @@ -1,702 +1,699 @@ -#----------------------------------------------------------------------------- +#----------------------------------------------------------------------------- # ply: lex.py # # Author: David M. 
Beazley (beazley@cs.uchicago.edu) # Department of Computer Science # University of Chicago # Chicago, IL 60637 # # Copyright (C) 2001, David M. Beazley # # $Header$ # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # See the file COPYING for a complete copy of the LGPL. # # # This module automatically constructs a lexical analysis module from regular # expression rules defined in a user-defined module. The idea is essentially the same # as that used in John Aycock's Spark framework, but the implementation works # at the module level rather than requiring the use of classes. # # This module tries to provide an interface that is closely modeled after # the traditional lex interface in Unix. It also differs from Spark # in that: # # - It provides more extensive error checking and reporting if # the user supplies a set of regular expressions that can't # be compiled or if there is any other kind of a problem in # the specification. # # - The interface is geared towards LALR(1) and LR(1) parser # generators. That is tokens are generated one at a time # rather than being generated in advanced all in one step. # # There are a few limitations of this module # # - The module interface makes it somewhat awkward to support more # than one lexer at a time. Although somewhat inelegant from a # design perspective, this is rarely a practical concern for # most compiler projects. # # - The lexer requires that the entire input text be read into # a string before scanning. I suppose that most machines have # enough memory to make this a minor issues, but it makes # the lexer somewhat difficult to use in interactive sessions # or with streaming data. # #----------------------------------------------------------------------------- r""" lex.py This module builds lex-like scanners based on regular expression rules. To use the module, simply write a collection of regular expression rules and actions like this: # lexer.py import lex # Define a list of valid tokens tokens = ( 'IDENTIFIER', 'NUMBER', 'PLUS', 'MINUS' ) # Define tokens as functions def t_IDENTIFIER(t): r' ([a-zA-Z_](\w|_)* ' return t def t_NUMBER(t): r' \d+ ' return t # Some simple tokens with no actions t_PLUS = r'\+' t_MINUS = r'-' # Initialize the lexer lex.lex() The tokens list is required and contains a complete list of all valid token types that the lexer is allowed to produce. Token types are restricted to be valid identifiers. This means that 'MINUS' is a valid token type whereas '-' is not. Rules are defined by writing a function with a name of the form t_rulename. Each rule must accept a single argument which is a token object generated by the lexer. This token has the following attributes: t.type = type string of the token. This is initially set to the name of the rule without the leading t_ t.value = The value of the lexeme. 
t.lineno = The value of the line number where the token was encountered For example, the t_NUMBER() rule above might be called with the following: t.type = 'NUMBER' t.value = '42' t.lineno = 3 Each rule returns the token object it would like to supply to the parser. In most cases, the token t is returned with few, if any modifications. To discard a token for things like whitespace or comments, simply return nothing. For instance: def t_whitespace(t): r' \s+ ' pass For faster lexing, you can also define this in terms of the ignore set like this: t_ignore = ' \t' The characters in this string are ignored by the lexer. Use of this feature can speed up parsing significantly since scanning will immediately proceed to the next token. lex requires that the token returned by each rule has an attribute t.type. Other than this, rules are free to return any kind of token object that they wish and may construct a new type of token object from the attributes of t (provided the new object has the required type attribute). If illegal characters are encountered, the scanner executes the function t_error(t) where t is a token representing the rest of the string that hasn't been matched. If this function isn't defined, a LexError exception is raised. The .text attribute of this exception object contains the part of the string that wasn't matched. The t.skip(n) method can be used to skip ahead n characters in the input stream. This is usually only used in the error handling rule. For instance, the following rule would print an error message and continue: def t_error(t): print "Illegal character in input %s" % t.value[0] t.skip(1) Of course, a nice scanner might wish to skip more than one character if the input looks very corrupted. The lex module defines a t.lineno attribute on each token that can be used to track the current line number in the input. The value of this variable is not modified by lex so it is up to your lexer module to correctly update its value depending on the lexical properties of the input language. To do this, you might write rules such as the following: def t_newline(t): r' \n+ ' t.lineno += t.value.count("\n") To initialize your lexer so that it can be used, simply call the lex.lex() function in your rule file. If there are any errors in your specification, warning messages or an exception will be generated to alert you to the problem. (dave: this needs to be rewritten) To use the newly constructed lexer from another module, simply do this: import lex import lexer plex.input("position = initial + rate*60") while 1: token = plex.token() # Get a token if not token: break # No more tokens ... do whatever ... Assuming that the module 'lexer' has initialized plex as shown above, parsing modules can safely import 'plex' without having to import the rule file or any additional imformation about the scanner you have defined. 
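As a minimal self-contained sketch (the token names and the input string
here are illustrative only, not part of any real grammar), a complete rule
module and driver loop might look like this:

    import lex

    tokens = ('NUMBER', 'PLUS')

    t_PLUS   = r'\+'
    t_ignore = ' \t'

    def t_NUMBER(t):
        r'\d+'
        return t

    def t_error(t):
        print "Illegal character '%s'" % t.value[0]
        t.skip(1)

    lexer = lex.lex()            # build the lexer from the rules above
    lexer.input("1 + 22 + 333")
    while 1:
        tok = lexer.token()      # LexToken(type, value, lineno) or None
        if not tok: break
        print tok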
""" # ----------------------------------------------------------------------------- __version__ = "1.4" import re, types, sys, copy # Exception thrown when invalid token encountered and no default class LexError(Exception): def __init__(self,message,s): self.args = (message,) self.text = s # Token class class LexToken: def __str__(self): return "LexToken(%s,%r,%d)" % (self.type,self.value,self.lineno) def __repr__(self): return str(self) def skip(self,n): try: self._skipn += n except AttributeError: self._skipn = n # ----------------------------------------------------------------------------- # Lexer class # # input() - Store a new string in the lexer # token() - Get the next token # ----------------------------------------------------------------------------- class Lexer: def __init__(self): self.lexre = None # Master regular expression self.lexdata = None # Actual input data (as a string) self.lexpos = 0 # Current position in input text self.lexlen = 0 # Length of the input text self.lexindexfunc = [ ] # Reverse mapping of groups to functions and types self.lexerrorf = None # Error rule (if any) self.lextokens = None # List of valid tokens self.lexignore = None # Ignored characters self.lineno = 1 # Current line number self.debug = 0 # Debugging mode self.optimize = 0 # Optimized mode self.token = self.errtoken def __copy__(self): c = Lexer() c.lexre = self.lexre c.lexdata = self.lexdata c.lexpos = self.lexpos c.lexlen = self.lexlen c.lenindexfunc = self.lexindexfunc c.lexerrorf = self.lexerrorf c.lextokens = self.lextokens c.lexignore = self.lexignore c.lineno = self.lineno c.optimize = self.optimize c.token = c.realtoken # ------------------------------------------------------------ # input() - Push a new string into the lexer # ------------------------------------------------------------ def input(self,s): if not isinstance(s,types.StringType): raise ValueError, "Expected a string" self.lexdata = s self.lexpos = 0 self.lexlen = len(s) self.token = self.realtoken # Change the token routine to point to realtoken() global token if token == self.errtoken: token = self.token # ------------------------------------------------------------ # errtoken() - Return error if token is called with no data # ------------------------------------------------------------ def errtoken(self): raise RuntimeError, "No input string given with input()" # ------------------------------------------------------------ # token() - Return the next token from the Lexer # # Note: This function has been carefully implemented to be as fast # as possible. 
Don't make changes unless you really know what # you are doing # ------------------------------------------------------------ def realtoken(self): # Make local copies of frequently referenced attributes lexpos = self.lexpos lexlen = self.lexlen lexignore = self.lexignore lexdata = self.lexdata while lexpos < lexlen: # This code provides some short-circuit code for whitespace, tabs, and other ignored characters if lexdata[lexpos] in lexignore: lexpos += 1 continue # Look for a regular expression match m = self.lexre.match(lexdata,lexpos) if m: i = m.lastindex lexpos = m.end() tok = LexToken() tok.value = m.group() tok.lineno = self.lineno tok.lexer = self func,tok.type = self.lexindexfunc[i] if not func: self.lexpos = lexpos return tok # If token is processed by a function, call it self.lexpos = lexpos newtok = func(tok) self.lineno = tok.lineno # Update line number # Every function must return a token, if nothing, we just move to next token if not newtok: continue # Verify type of the token. If not in the token map, raise an error if not self.optimize: if not self.lextokens.has_key(newtok.type): raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % ( func.func_code.co_filename, func.func_code.co_firstlineno, func.__name__, newtok.type),lexdata[lexpos:]) return newtok # No match. Call t_error() if defined. if self.lexerrorf: tok = LexToken() tok.value = self.lexdata[lexpos:] tok.lineno = self.lineno tok.type = "error" tok.lexer = self oldpos = lexpos newtok = self.lexerrorf(tok) lexpos += getattr(tok,"_skipn",0) if oldpos == lexpos: # Error method didn't change text position at all. This is an error. self.lexpos = lexpos raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) if not newtok: continue self.lexpos = lexpos return newtok self.lexpos = lexpos raise LexError, ("No match found", lexdata[lexpos:]) # No more input data self.lexpos = lexpos + 1 return None # ----------------------------------------------------------------------------- # validate_file() # # This checks to see if there are duplicated t_rulename() functions or strings # in the parser input file. This is done using a simple regular expression # match on each line in the filename. # ----------------------------------------------------------------------------- def validate_file(filename): import os.path base,ext = os.path.splitext(filename) if ext != '.py': return 1 # No idea what the file is. Return OK try: f = open(filename) lines = f.readlines() f.close() except IOError: return 1 # Oh well fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(') sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=') counthash = { } linen = 1 noerror = 1 for l in lines: m = fre.match(l) if not m: m = sre.match(l) if m: name = m.group(1) prev = counthash.get(name) if not prev: counthash[name] = linen else: print "%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev) noerror = 0 linen += 1 return noerror # ----------------------------------------------------------------------------- # _read_lextab(module) # # Reads lexer table from a lextab file instead of using introspection. 
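# The generated module is expected to define _lexre (the master regex
# source), _lextab (the group-to-rule index table), _lextokens, _lexignore
# and _lexerrorf, exactly as lex() writes them out in optimize mode.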
# ----------------------------------------------------------------------------- def _read_lextab(lexer, fdict, module): exec "import %s as lextab" % module lexer.lexre = re.compile(lextab._lexre, re.VERBOSE) lexer.lexindexfunc = lextab._lextab for i in range(len(lextab._lextab)): t = lexer.lexindexfunc[i] if t: if t[0]: lexer.lexindexfunc[i] = (fdict[t[0]],t[1]) lexer.lextokens = lextab._lextokens lexer.lexignore = lextab._lexignore if lextab._lexerrorf: lexer.lexerrorf = fdict[lextab._lexerrorf] # ----------------------------------------------------------------------------- # lex(module) # # Build all of the regular expression rules from definitions in the supplied module # ----------------------------------------------------------------------------- def lex(module=None,debug=0,optimize=0,lextab="lextab"): ldict = None regex = "" error = 0 files = { } lexer = Lexer() lexer.debug = debug lexer.optimize = optimize global token,input if module: # User supplied a module object. if isinstance(module, types.ModuleType): ldict = module.__dict__ elif isinstance(module, types.InstanceType): _items = [(k,getattr(module,k)) for k in dir(module)] ldict = { } for (i,v) in _items: ldict[i] = v else: raise ValueError,"Expected a module or instance" else: # No module given. We might be able to get information from the caller. try: raise RuntimeError except RuntimeError: e,b,t = sys.exc_info() f = t.tb_frame f = f.f_back # Walk out to our calling function ldict = f.f_globals # Grab its globals dictionary if optimize and lextab: try: _read_lextab(lexer,ldict, lextab) if not lexer.lexignore: lexer.lexignore = "" token = lexer.token input = lexer.input return lexer except ImportError: pass # Get the tokens map if (module and isinstance(module,types.InstanceType)): tokens = getattr(module,"tokens",None) else: try: tokens = ldict["tokens"] except KeyError: tokens = None if not tokens: raise SyntaxError,"lex: module does not define 'tokens'" if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)): raise SyntaxError,"lex: tokens must be a list or tuple." # Build a dictionary of valid token names lexer.lextokens = { } if not optimize: # Utility function for verifying tokens def is_identifier(s): for c in s: if not (c.isalnum() or c == '_'): return 0 return 1 for n in tokens: if not is_identifier(n): print "lex: Bad token name '%s'" % n error = 1 if lexer.lextokens.has_key(n): print "lex: Warning. Token '%s' multiply defined." % n lexer.lextokens[n] = None else: for n in tokens: lexer.lextokens[n] = None if debug: print "lex: tokens = '%s'" % lexer.lextokens.keys() # Get a list of symbols with the t_ prefix tsymbols = [f for f in ldict.keys() if f[:2] == 't_'] # Now build up a list of functions and a list of strings fsymbols = [ ] ssymbols = [ ] for f in tsymbols: if callable(ldict[f]): fsymbols.append(ldict[f]) elif isinstance(ldict[f], types.StringType): ssymbols.append((f,ldict[f])) else: print "lex: %s not defined as a function or string" % f error = 1 # Sort the functions by line number fsymbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno)) # Sort the strings by regular expression length ssymbols.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1]))) # Check for non-empty symbols if len(fsymbols) == 0 and len(ssymbols) == 0: raise SyntaxError,"lex: no rules of the form t_rulename are defined." 
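    # Ordering matters from here on: every rule ends up as one alternative in
    # a single master regex, and Python's re engine takes the first
    # alternative that matches rather than the longest one.  Function rules
    # are therefore added in definition order, followed by string rules with
    # longer patterns ahead of shorter ones.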
# Add all of the rules defined with actions first for f in fsymbols: line = f.func_code.co_firstlineno file = f.func_code.co_filename files[file] = None ismethod = isinstance(f, types.MethodType) if not optimize: nargs = f.func_code.co_argcount if ismethod: reqargs = 2 else: reqargs = 1 if nargs > reqargs: print "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__) error = 1 continue if nargs < reqargs: print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__) error = 1 continue if f.__name__ == 't_ignore': print "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__) error = 1 continue if f.__name__ == 't_error': lexer.lexerrorf = f continue if f.__doc__: if not optimize: try: c = re.compile(f.__doc__, re.VERBOSE) except re.error,e: print "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e) error = 1 continue if debug: print "lex: Adding rule %s -> '%s'" % (f.__name__,f.__doc__) # Okay. The regular expression seemed okay. Let's append it to the master regular # expression we're building if (regex): regex += "|" regex += "(?P<%s>%s)" % (f.__name__,f.__doc__) else: print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__) # Now add all of the simple rules for name,r in ssymbols: if name == 't_ignore': lexer.lexignore = r continue if not optimize: if name == 't_error': raise SyntaxError,"lex: Rule 't_error' must be defined as a function" error = 1 continue if not lexer.lextokens.has_key(name[2:]): print "lex: Rule '%s' defined for an unspecified token %s." % (name,name[2:]) error = 1 continue try: c = re.compile(r,re.VERBOSE) except re.error,e: print "lex: Invalid regular expression for rule '%s'. %s" % (name,e) error = 1 continue if debug: print "lex: Adding rule %s -> '%s'" % (name,r) if regex: regex += "|" regex += "(?P<%s>%s)" % (name,r) if not optimize: for f in files.keys(): if not validate_file(f): error = 1 try: if debug: print "lex: regex = '%s'" % regex lexer.lexre = re.compile(regex, re.VERBOSE) # Build the index to function map for the matching engine lexer.lexindexfunc = [ None ] * (max(lexer.lexre.groupindex.values())+1) for f,i in lexer.lexre.groupindex.items(): handle = ldict[f] if type(handle) in (types.FunctionType, types.MethodType): lexer.lexindexfunc[i] = (handle,handle.__name__[2:]) else: # If rule was specified as a string, we build an anonymous # callback function to carry out the action lexer.lexindexfunc[i] = (None,f[2:]) # If a lextab was specified, we create a file containing the precomputed # regular expression and index table if lextab and optimize: lt = open(lextab+".py","w") lt.write("# %s.py. This file automatically created by PLY. Don't edit.\n" % lextab) lt.write("_lexre = %s\n" % repr(regex)) lt.write("_lextab = [\n"); for i in range(0,len(lexer.lexindexfunc)): t = lexer.lexindexfunc[i] if t: if t[0]: lt.write(" ('%s',%s),\n"% (t[0].__name__, repr(t[1]))) else: lt.write(" (None,%s),\n" % repr(t[1])) else: lt.write(" None,\n") lt.write("]\n"); lt.write("_lextokens = %s\n" % repr(lexer.lextokens)) lt.write("_lexignore = %s\n" % repr(lexer.lexignore)) if (lexer.lexerrorf): lt.write("_lexerrorf = %s\n" % repr(lexer.lexerrorf.__name__)) else: lt.write("_lexerrorf = None\n") lt.close() except re.error,e: print "lex: Fatal error. Unable to compile regular expression rules. %s" % e error = 1 if error: raise SyntaxError,"lex: Unable to build lexer." if not lexer.lexerrorf: print "lex: Warning. no t_error rule is defined." 
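    # Typical optimized usage is a two-step round trip (a sketch, assuming
    # the generated lextab module ends up on the import path):
    #
    #     lexer = lex.lex(optimize=1, lextab="lextab")
    #
    # The first such call writes lextab.py with the master regex and index
    # table built above; subsequent calls import it via _read_lextab() and
    # skip rule validation and regex construction entirely.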
if not lexer.lexignore: lexer.lexignore = "" # Create global versions of the token() and input() functions token = lexer.token input = lexer.input return lexer # ----------------------------------------------------------------------------- # run() # # This runs the lexer as a main program # ----------------------------------------------------------------------------- def runmain(lexer=None,data=None): if not data: try: filename = sys.argv[1] f = open(filename) data = f.read() f.close() except IndexError: print "Reading from standard input (type EOF to end):" data = sys.stdin.read() if lexer: _input = lexer.input else: _input = input _input(data) if lexer: _token = lexer.token else: _token = token while 1: tok = _token() if not tok: break print "(%s,'%s',%d)" % (tok.type, tok.value, tok.lineno) - - - diff --git a/modules/elmsubmit/lib/lex.py.wml b/modules/elmsubmit/lib/lex.py.wml deleted file mode 100644 index 02248cbce..000000000 --- a/modules/elmsubmit/lib/lex.py.wml +++ /dev/null @@ -1,702 +0,0 @@ -#----------------------------------------------------------------------------- -# ply: lex.py -# -# Author: David M. Beazley (beazley@cs.uchicago.edu) -# Department of Computer Science -# University of Chicago -# Chicago, IL 60637 -# -# Copyright (C) 2001, David M. Beazley -# -# $Header$ -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -# -# See the file COPYING for a complete copy of the LGPL. -# -# -# This module automatically constructs a lexical analysis module from regular -# expression rules defined in a user-defined module. The idea is essentially the same -# as that used in John Aycock's Spark framework, but the implementation works -# at the module level rather than requiring the use of classes. -# -# This module tries to provide an interface that is closely modeled after -# the traditional lex interface in Unix. It also differs from Spark -# in that: -# -# - It provides more extensive error checking and reporting if -# the user supplies a set of regular expressions that can't -# be compiled or if there is any other kind of a problem in -# the specification. -# -# - The interface is geared towards LALR(1) and LR(1) parser -# generators. That is tokens are generated one at a time -# rather than being generated in advanced all in one step. -# -# There are a few limitations of this module -# -# - The module interface makes it somewhat awkward to support more -# than one lexer at a time. Although somewhat inelegant from a -# design perspective, this is rarely a practical concern for -# most compiler projects. -# -# - The lexer requires that the entire input text be read into -# a string before scanning. I suppose that most machines have -# enough memory to make this a minor issues, but it makes -# the lexer somewhat difficult to use in interactive sessions -# or with streaming data. 
-# -#----------------------------------------------------------------------------- - -r""" -lex.py - -This module builds lex-like scanners based on regular expression rules. -To use the module, simply write a collection of regular expression rules -and actions like this: - -# lexer.py -import lex - -# Define a list of valid tokens -tokens = ( - 'IDENTIFIER', 'NUMBER', 'PLUS', 'MINUS' - ) - -# Define tokens as functions -def t_IDENTIFIER(t): - r' ([a-zA-Z_](\w|_)* ' - return t - -def t_NUMBER(t): - r' \d+ ' - return t - -# Some simple tokens with no actions -t_PLUS = r'\+' -t_MINUS = r'-' - -# Initialize the lexer -lex.lex() - -The tokens list is required and contains a complete list of all valid -token types that the lexer is allowed to produce. Token types are -restricted to be valid identifiers. This means that 'MINUS' is a valid -token type whereas '-' is not. - -Rules are defined by writing a function with a name of the form -t_rulename. Each rule must accept a single argument which is -a token object generated by the lexer. This token has the following -attributes: - - t.type = type string of the token. This is initially set to the - name of the rule without the leading t_ - t.value = The value of the lexeme. - t.lineno = The value of the line number where the token was encountered - -For example, the t_NUMBER() rule above might be called with the following: - - t.type = 'NUMBER' - t.value = '42' - t.lineno = 3 - -Each rule returns the token object it would like to supply to the -parser. In most cases, the token t is returned with few, if any -modifications. To discard a token for things like whitespace or -comments, simply return nothing. For instance: - -def t_whitespace(t): - r' \s+ ' - pass - -For faster lexing, you can also define this in terms of the ignore set like this: - -t_ignore = ' \t' - -The characters in this string are ignored by the lexer. Use of this feature can speed -up parsing significantly since scanning will immediately proceed to the next token. - -lex requires that the token returned by each rule has an attribute -t.type. Other than this, rules are free to return any kind of token -object that they wish and may construct a new type of token object -from the attributes of t (provided the new object has the required -type attribute). - -If illegal characters are encountered, the scanner executes the -function t_error(t) where t is a token representing the rest of the -string that hasn't been matched. If this function isn't defined, a -LexError exception is raised. The .text attribute of this exception -object contains the part of the string that wasn't matched. - -The t.skip(n) method can be used to skip ahead n characters in the -input stream. This is usually only used in the error handling rule. -For instance, the following rule would print an error message and -continue: - -def t_error(t): - print "Illegal character in input %s" % t.value[0] - t.skip(1) - -Of course, a nice scanner might wish to skip more than one character -if the input looks very corrupted. - -The lex module defines a t.lineno attribute on each token that can be used -to track the current line number in the input. The value of this -variable is not modified by lex so it is up to your lexer module -to correctly update its value depending on the lexical properties -of the input language. 
To do this, you might write rules such as -the following: - -def t_newline(t): - r' \n+ ' - t.lineno += t.value.count("\n") - -To initialize your lexer so that it can be used, simply call the lex.lex() -function in your rule file. If there are any errors in your -specification, warning messages or an exception will be generated to -alert you to the problem. - -(dave: this needs to be rewritten) -To use the newly constructed lexer from another module, simply do -this: - - import lex - import lexer - plex.input("position = initial + rate*60") - - while 1: - token = plex.token() # Get a token - if not token: break # No more tokens - ... do whatever ... - -Assuming that the module 'lexer' has initialized plex as shown -above, parsing modules can safely import 'plex' without having -to import the rule file or any additional imformation about the -scanner you have defined. -""" - -# ----------------------------------------------------------------------------- - - -__version__ = "1.4" - -import re, types, sys, copy - -# Exception thrown when invalid token encountered and no default -class LexError(Exception): - def __init__(self,message,s): - self.args = (message,) - self.text = s - -# Token class -class LexToken: - def __str__(self): - return "LexToken(%s,%r,%d)" % (self.type,self.value,self.lineno) - def __repr__(self): - return str(self) - def skip(self,n): - try: - self._skipn += n - except AttributeError: - self._skipn = n - -# ----------------------------------------------------------------------------- -# Lexer class -# -# input() - Store a new string in the lexer -# token() - Get the next token -# ----------------------------------------------------------------------------- - -class Lexer: - def __init__(self): - self.lexre = None # Master regular expression - self.lexdata = None # Actual input data (as a string) - self.lexpos = 0 # Current position in input text - self.lexlen = 0 # Length of the input text - self.lexindexfunc = [ ] # Reverse mapping of groups to functions and types - self.lexerrorf = None # Error rule (if any) - self.lextokens = None # List of valid tokens - self.lexignore = None # Ignored characters - self.lineno = 1 # Current line number - self.debug = 0 # Debugging mode - self.optimize = 0 # Optimized mode - self.token = self.errtoken - - def __copy__(self): - c = Lexer() - c.lexre = self.lexre - c.lexdata = self.lexdata - c.lexpos = self.lexpos - c.lexlen = self.lexlen - c.lenindexfunc = self.lexindexfunc - c.lexerrorf = self.lexerrorf - c.lextokens = self.lextokens - c.lexignore = self.lexignore - c.lineno = self.lineno - c.optimize = self.optimize - c.token = c.realtoken - - # ------------------------------------------------------------ - # input() - Push a new string into the lexer - # ------------------------------------------------------------ - def input(self,s): - if not isinstance(s,types.StringType): - raise ValueError, "Expected a string" - self.lexdata = s - self.lexpos = 0 - self.lexlen = len(s) - self.token = self.realtoken - - # Change the token routine to point to realtoken() - global token - if token == self.errtoken: - token = self.token - - # ------------------------------------------------------------ - # errtoken() - Return error if token is called with no data - # ------------------------------------------------------------ - def errtoken(self): - raise RuntimeError, "No input string given with input()" - - # ------------------------------------------------------------ - # token() - Return the next token from the Lexer - # - # Note: This function 
has been carefully implemented to be as fast - # as possible. Don't make changes unless you really know what - # you are doing - # ------------------------------------------------------------ - def realtoken(self): - # Make local copies of frequently referenced attributes - lexpos = self.lexpos - lexlen = self.lexlen - lexignore = self.lexignore - lexdata = self.lexdata - - while lexpos < lexlen: - # This code provides some short-circuit code for whitespace, tabs, and other ignored characters - if lexdata[lexpos] in lexignore: - lexpos += 1 - continue - - # Look for a regular expression match - m = self.lexre.match(lexdata,lexpos) - if m: - i = m.lastindex - lexpos = m.end() - tok = LexToken() - tok.value = m.group() - tok.lineno = self.lineno - tok.lexer = self - func,tok.type = self.lexindexfunc[i] - if not func: - self.lexpos = lexpos - return tok - - # If token is processed by a function, call it - self.lexpos = lexpos - newtok = func(tok) - self.lineno = tok.lineno # Update line number - - # Every function must return a token, if nothing, we just move to next token - if not newtok: continue - - # Verify type of the token. If not in the token map, raise an error - if not self.optimize: - if not self.lextokens.has_key(newtok.type): - raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % ( - func.func_code.co_filename, func.func_code.co_firstlineno, - func.__name__, newtok.type),lexdata[lexpos:]) - - return newtok - - # No match. Call t_error() if defined. - if self.lexerrorf: - tok = LexToken() - tok.value = self.lexdata[lexpos:] - tok.lineno = self.lineno - tok.type = "error" - tok.lexer = self - oldpos = lexpos - newtok = self.lexerrorf(tok) - lexpos += getattr(tok,"_skipn",0) - if oldpos == lexpos: - # Error method didn't change text position at all. This is an error. - self.lexpos = lexpos - raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) - if not newtok: continue - self.lexpos = lexpos - return newtok - - self.lexpos = lexpos - raise LexError, ("No match found", lexdata[lexpos:]) - - # No more input data - self.lexpos = lexpos + 1 - return None - - -# ----------------------------------------------------------------------------- -# validate_file() -# -# This checks to see if there are duplicated t_rulename() functions or strings -# in the parser input file. This is done using a simple regular expression -# match on each line in the filename. -# ----------------------------------------------------------------------------- - -def validate_file(filename): - import os.path - base,ext = os.path.splitext(filename) - if ext != '.py': return 1 # No idea what the file is. Return OK - - try: - f = open(filename) - lines = f.readlines() - f.close() - except IOError: - return 1 # Oh well - - fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(') - sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=') - counthash = { } - linen = 1 - noerror = 1 - for l in lines: - m = fre.match(l) - if not m: - m = sre.match(l) - if m: - name = m.group(1) - prev = counthash.get(name) - if not prev: - counthash[name] = linen - else: - print "%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev) - noerror = 0 - linen += 1 - return noerror - -# ----------------------------------------------------------------------------- -# _read_lextab(module) -# -# Reads lexer table from a lextab file instead of using introspection. 
-# ----------------------------------------------------------------------------- - -def _read_lextab(lexer, fdict, module): - exec "import %s as lextab" % module - lexer.lexre = re.compile(lextab._lexre, re.VERBOSE) - lexer.lexindexfunc = lextab._lextab - for i in range(len(lextab._lextab)): - t = lexer.lexindexfunc[i] - if t: - if t[0]: - lexer.lexindexfunc[i] = (fdict[t[0]],t[1]) - lexer.lextokens = lextab._lextokens - lexer.lexignore = lextab._lexignore - if lextab._lexerrorf: - lexer.lexerrorf = fdict[lextab._lexerrorf] - -# ----------------------------------------------------------------------------- -# lex(module) -# -# Build all of the regular expression rules from definitions in the supplied module -# ----------------------------------------------------------------------------- -def lex(module=None,debug=0,optimize=0,lextab="lextab"): - ldict = None - regex = "" - error = 0 - files = { } - lexer = Lexer() - lexer.debug = debug - lexer.optimize = optimize - global token,input - - if module: - # User supplied a module object. - if isinstance(module, types.ModuleType): - ldict = module.__dict__ - elif isinstance(module, types.InstanceType): - _items = [(k,getattr(module,k)) for k in dir(module)] - ldict = { } - for (i,v) in _items: - ldict[i] = v - else: - raise ValueError,"Expected a module or instance" - - else: - # No module given. We might be able to get information from the caller. - try: - raise RuntimeError - except RuntimeError: - e,b,t = sys.exc_info() - f = t.tb_frame - f = f.f_back # Walk out to our calling function - ldict = f.f_globals # Grab its globals dictionary - - if optimize and lextab: - try: - _read_lextab(lexer,ldict, lextab) - if not lexer.lexignore: lexer.lexignore = "" - token = lexer.token - input = lexer.input - return lexer - - except ImportError: - pass - - # Get the tokens map - if (module and isinstance(module,types.InstanceType)): - tokens = getattr(module,"tokens",None) - else: - try: - tokens = ldict["tokens"] - except KeyError: - tokens = None - - if not tokens: - raise SyntaxError,"lex: module does not define 'tokens'" - if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)): - raise SyntaxError,"lex: tokens must be a list or tuple." - - # Build a dictionary of valid token names - lexer.lextokens = { } - if not optimize: - - # Utility function for verifying tokens - def is_identifier(s): - for c in s: - if not (c.isalnum() or c == '_'): return 0 - return 1 - - for n in tokens: - if not is_identifier(n): - print "lex: Bad token name '%s'" % n - error = 1 - if lexer.lextokens.has_key(n): - print "lex: Warning. Token '%s' multiply defined." 
% n - lexer.lextokens[n] = None - else: - for n in tokens: lexer.lextokens[n] = None - - - if debug: - print "lex: tokens = '%s'" % lexer.lextokens.keys() - - # Get a list of symbols with the t_ prefix - tsymbols = [f for f in ldict.keys() if f[:2] == 't_'] - - # Now build up a list of functions and a list of strings - fsymbols = [ ] - ssymbols = [ ] - for f in tsymbols: - if callable(ldict[f]): - fsymbols.append(ldict[f]) - elif isinstance(ldict[f], types.StringType): - ssymbols.append((f,ldict[f])) - else: - print "lex: %s not defined as a function or string" % f - error = 1 - - # Sort the functions by line number - fsymbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno)) - - # Sort the strings by regular expression length - ssymbols.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1]))) - - # Check for non-empty symbols - if len(fsymbols) == 0 and len(ssymbols) == 0: - raise SyntaxError,"lex: no rules of the form t_rulename are defined." - - # Add all of the rules defined with actions first - for f in fsymbols: - - line = f.func_code.co_firstlineno - file = f.func_code.co_filename - files[file] = None - - ismethod = isinstance(f, types.MethodType) - - if not optimize: - nargs = f.func_code.co_argcount - if ismethod: - reqargs = 2 - else: - reqargs = 1 - if nargs > reqargs: - print "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__) - error = 1 - continue - - if nargs < reqargs: - print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__) - error = 1 - continue - - if f.__name__ == 't_ignore': - print "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__) - error = 1 - continue - - if f.__name__ == 't_error': - lexer.lexerrorf = f - continue - - if f.__doc__: - if not optimize: - try: - c = re.compile(f.__doc__, re.VERBOSE) - except re.error,e: - print "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e) - error = 1 - continue - - if debug: - print "lex: Adding rule %s -> '%s'" % (f.__name__,f.__doc__) - - # Okay. The regular expression seemed okay. Let's append it to the master regular - # expression we're building - - if (regex): regex += "|" - regex += "(?P<%s>%s)" % (f.__name__,f.__doc__) - else: - print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__) - - # Now add all of the simple rules - for name,r in ssymbols: - - if name == 't_ignore': - lexer.lexignore = r - continue - - if not optimize: - if name == 't_error': - raise SyntaxError,"lex: Rule 't_error' must be defined as a function" - error = 1 - continue - - if not lexer.lextokens.has_key(name[2:]): - print "lex: Rule '%s' defined for an unspecified token %s." % (name,name[2:]) - error = 1 - continue - try: - c = re.compile(r,re.VERBOSE) - except re.error,e: - print "lex: Invalid regular expression for rule '%s'. 
%s" % (name,e) - error = 1 - continue - if debug: - print "lex: Adding rule %s -> '%s'" % (name,r) - - if regex: regex += "|" - regex += "(?P<%s>%s)" % (name,r) - - if not optimize: - for f in files.keys(): - if not validate_file(f): - error = 1 - try: - if debug: - print "lex: regex = '%s'" % regex - lexer.lexre = re.compile(regex, re.VERBOSE) - - # Build the index to function map for the matching engine - lexer.lexindexfunc = [ None ] * (max(lexer.lexre.groupindex.values())+1) - for f,i in lexer.lexre.groupindex.items(): - handle = ldict[f] - if type(handle) in (types.FunctionType, types.MethodType): - lexer.lexindexfunc[i] = (handle,handle.__name__[2:]) - else: - # If rule was specified as a string, we build an anonymous - # callback function to carry out the action - lexer.lexindexfunc[i] = (None,f[2:]) - - # If a lextab was specified, we create a file containing the precomputed - # regular expression and index table - - if lextab and optimize: - lt = open(lextab+".py","w") - lt.write("# %s.py. This file automatically created by PLY. Don't edit.\n" % lextab) - lt.write("_lexre = %s\n" % repr(regex)) - lt.write("_lextab = [\n"); - for i in range(0,len(lexer.lexindexfunc)): - t = lexer.lexindexfunc[i] - if t: - if t[0]: - lt.write(" ('%s',%s),\n"% (t[0].__name__, repr(t[1]))) - else: - lt.write(" (None,%s),\n" % repr(t[1])) - else: - lt.write(" None,\n") - - lt.write("]\n"); - lt.write("_lextokens = %s\n" % repr(lexer.lextokens)) - lt.write("_lexignore = %s\n" % repr(lexer.lexignore)) - if (lexer.lexerrorf): - lt.write("_lexerrorf = %s\n" % repr(lexer.lexerrorf.__name__)) - else: - lt.write("_lexerrorf = None\n") - lt.close() - - except re.error,e: - print "lex: Fatal error. Unable to compile regular expression rules. %s" % e - error = 1 - if error: - raise SyntaxError,"lex: Unable to build lexer." - if not lexer.lexerrorf: - print "lex: Warning. no t_error rule is defined." - - if not lexer.lexignore: lexer.lexignore = "" - - # Create global versions of the token() and input() functions - token = lexer.token - input = lexer.input - - return lexer - -# ----------------------------------------------------------------------------- -# run() -# -# This runs the lexer as a main program -# ----------------------------------------------------------------------------- - -def runmain(lexer=None,data=None): - if not data: - try: - filename = sys.argv[1] - f = open(filename) - data = f.read() - f.close() - except IndexError: - print "Reading from standard input (type EOF to end):" - data = sys.stdin.read() - - if lexer: - _input = lexer.input - else: - _input = input - _input(data) - if lexer: - _token = lexer.token - else: - _token = token - - while 1: - tok = _token() - if not tok: break - print "(%s,'%s',%d)" % (tok.type, tok.value, tok.lineno) - - - - - diff --git a/modules/elmsubmit/lib/lextab.py b/modules/elmsubmit/lib/lextab.py index e2b1d6d44..c3a463fc5 100644 --- a/modules/elmsubmit/lib/lextab.py +++ b/modules/elmsubmit/lib/lextab.py @@ -1,14 +1,14 @@ -# lextab.py. This file automatically created by PLY. Don't edit. +# lextab.py. This file automatically created by PLY. Don't edit. 
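The lex.py source shown above is PLY's scanner generator: lex() collects the `tokens` list and the t_-prefixed rules from the calling module (string rules are literal regexes, function rules carry their regex in the docstring), stitches them into one master pattern, and, when called with optimize=1, caches the result in a lextab module such as the regenerated lextab.py whose tables follow. Below is a minimal sketch of a module that feeds it; the token names and rules are illustrative only and are not taken from the elmsubmit sources.

    # Illustrative sketch, not part of the CDSware sources.
    import lex

    tokens = ('NUMBER', 'PLUS')          # every rule name must be listed here

    t_PLUS   = r'\+'                     # string rules are plain regexes
    t_ignore = ' \t'                     # t_ignore must be a string, not a function

    def t_NUMBER(t):
        r'\d+'                           # the docstring is the rule's regex
        t.value = int(t.value)
        return t

    lexer = lex.lex()                    # with no module argument, rules come from the caller's globals
    lexer.input('1 + 2')
    while 1:
        tok = lexer.token()
        if not tok: break
        print tok.type, tok.value

A real rule module would also define t_error; lex() only prints a warning when it is missing. With optimize=1 and a lextab name, lex() first tries _read_lextab() to import the precomputed tables and, only if that import fails, rebuilds them and writes the lextab file using the code above.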
_lexre = '(?P\\s*cdson:::\\n+)|(?P(?<=\\n)[\\ \\t]*_+\\w+?_+\\s*\\n+)|(?P.+?\\S+.*?(?=([\\ \\t]*_+\\w+?_+\\s*\\n|\\n\\s*cdsoff:::)))|(?P(?s)\\n\\s*cdsoff:::(\\n.*)?)' _lextab = [ None, ('t_CDSON','CDSON'), ('t_KEY','KEY'), ('t_VALUE','VALUE'), None, ('t_CDSOFF','CDSOFF'), ] _lextokens = {'VALUE': None, 'CDSON': None, 'CDSOFF': None, 'KEY': None} _lexignore = None _lexerrorf = 't_error' - + diff --git a/modules/elmsubmit/lib/lextab.py.wml b/modules/elmsubmit/lib/lextab.py.wml deleted file mode 100644 index e2b1d6d44..000000000 --- a/modules/elmsubmit/lib/lextab.py.wml +++ /dev/null @@ -1,14 +0,0 @@ -# lextab.py. This file automatically created by PLY. Don't edit. -_lexre = '(?P\\s*cdson:::\\n+)|(?P(?<=\\n)[\\ \\t]*_+\\w+?_+\\s*\\n+)|(?P.+?\\S+.*?(?=([\\ \\t]*_+\\w+?_+\\s*\\n|\\n\\s*cdsoff:::)))|(?P(?s)\\n\\s*cdsoff:::(\\n.*)?)' -_lextab = [ - None, - ('t_CDSON','CDSON'), - ('t_KEY','KEY'), - ('t_VALUE','VALUE'), - None, - ('t_CDSOFF','CDSOFF'), -] -_lextokens = {'VALUE': None, 'CDSON': None, 'CDSOFF': None, 'KEY': None} -_lexignore = None -_lexerrorf = 't_error' - diff --git a/modules/elmsubmit/lib/magic/.cvsignore b/modules/elmsubmit/lib/magic/.cvsignore index c5cd3a479..9638520ce 100644 --- a/modules/elmsubmit/lib/magic/.cvsignore +++ b/modules/elmsubmit/lib/magic/.cvsignore @@ -1,7 +1,6 @@ Makefile Makefile.in z_* *.O *~ -*.py *.pyc \ No newline at end of file diff --git a/modules/elmsubmit/lib/magic/Makefile.am b/modules/elmsubmit/lib/magic/Makefile.am index 691d40dc4..72dc42a11 100644 --- a/modules/elmsubmit/lib/magic/Makefile.am +++ b/modules/elmsubmit/lib/magic/Makefile.am @@ -1,29 +1,26 @@ ## $Id$ ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002, 2003, 2004, 2005 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. pylibdir = $(libdir)/python/cdsware/magic -pylib_DATA = __init__.py compile_magic.py magic.ext magic.ext.mgc -FILESWML = $(wildcard $(srcdir)/*.wml) -EXTRA_DIST = $(FILESWML:$(srcdir)/%=%) magic.ext magic.ext.mgc +pylib_DATA = __init__.py compile_magic.py magic.ext magic.ext.mgc -CLEANFILES = *.py *~ *.tmp *.pyc +EXTRA_DIST = $(pylib_DATA) -%.py: %.py.wml $(top_srcdir)/config/config.wml $(top_builddir)/config/configbis.wml - $(WML) -o $@ $< +CLEANFILES = *~ *.tmp *.pyc diff --git a/modules/elmsubmit/lib/magic/__init__.py.wml b/modules/elmsubmit/lib/magic/__init__.py.wml deleted file mode 100644 index e69de29bb..000000000 diff --git a/modules/elmsubmit/lib/magic/compile_magic.py b/modules/elmsubmit/lib/magic/compile_magic.py index 6013b6327..cb06b706e 100644 --- a/modules/elmsubmit/lib/magic/compile_magic.py +++ b/modules/elmsubmit/lib/magic/compile_magic.py @@ -1,45 +1,41 @@ -# -*- coding: utf-8 -*- - -## $Id$ - +# -*- coding: utf-8 -*- +#! /usr/bin/env python +## +## $Id$ +## ## This file is part of the CERN Document Server Software (CDSware). 
## Copyright (C) 2002, 2003, 2004, 2005 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - -#! /usr/bin/env python - """ Compile magic files listed on the command line. Print name of compiled file. """ import sys import os import magic.magic as magic magician = magic.open(magic.MAGIC_NONE) for filename in sys.argv[1:]: if os.path.isdir(filename): print filename, "is Directory!" continue if magician.compile(filename) == 0: print filename, "compiled OK." else: print filename, "failed to compile." - + diff --git a/modules/elmsubmit/lib/magic/compile_magic.py.wml b/modules/elmsubmit/lib/magic/compile_magic.py.wml deleted file mode 100644 index 6013b6327..000000000 --- a/modules/elmsubmit/lib/magic/compile_magic.py.wml +++ /dev/null @@ -1,45 +0,0 @@ -# -*- coding: utf-8 -*- - -## $Id$ - -## This file is part of the CERN Document Server Software (CDSware). -## Copyright (C) 2002, 2003, 2004, 2005 CERN. -## -## The CDSware is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License as -## published by the Free Software Foundation; either version 2 of the -## License, or (at your option) any later version. -## -## The CDSware is distributed in the hope that it will be useful, but -## WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -## General Public License for more details. -## -## You should have received a copy of the GNU General Public License -## along with CDSware; if not, write to the Free Software Foundation, Inc., -## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. - - -#! /usr/bin/env python - -""" -Compile magic files listed on the command line. Print name of compiled file. -""" - -import sys -import os -import magic.magic as magic - -magician = magic.open(magic.MAGIC_NONE) - -for filename in sys.argv[1:]: - if os.path.isdir(filename): - print filename, "is Directory!" - continue - if magician.compile(filename) == 0: - print filename, "compiled OK." - else: - print filename, "failed to compile." 
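compile_magic.py, kept above while its generated .py.wml twin is dropped here, is a small wrapper around the bundled libmagic bindings: it opens a MAGIC_NONE cookie and asks it to compile every magic source file named on the command line, skipping directories. A typical invocation, assuming the magic.ext source file shipped in this directory, would look like:

    $ python compile_magic.py magic.ext
    magic.ext compiled OK.

The compiled database is presumably what magic/Makefile.am distributes as magic.ext.mgc; that .mgc name follows libmagic's usual output convention rather than anything stated in the script itself.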
- diff --git a/modules/elmsubmit/lib/myhtmlentitydefs.py b/modules/elmsubmit/lib/myhtmlentitydefs.py index f108bb329..3eceb72dc 100644 --- a/modules/elmsubmit/lib/myhtmlentitydefs.py +++ b/modules/elmsubmit/lib/myhtmlentitydefs.py @@ -1,274 +1,274 @@ -"""HTML character entity references.""" +"""HTML character entity references.""" # maps the HTML entity name to the Unicode codepoint name2codepoint = { 'AElig': 0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 'Aacute': 0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1 'Acirc': 0x00c2, # latin capital letter A with circumflex, U+00C2 ISOlat1 'Agrave': 0x00c0, # latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 'Alpha': 0x0391, # greek capital letter alpha, U+0391 'Aring': 0x00c5, # latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 'Atilde': 0x00c3, # latin capital letter A with tilde, U+00C3 ISOlat1 'Auml': 0x00c4, # latin capital letter A with diaeresis, U+00C4 ISOlat1 'Beta': 0x0392, # greek capital letter beta, U+0392 'Ccedil': 0x00c7, # latin capital letter C with cedilla, U+00C7 ISOlat1 'Chi': 0x03a7, # greek capital letter chi, U+03A7 'Dagger': 0x2021, # double dagger, U+2021 ISOpub 'Delta': 0x0394, # greek capital letter delta, U+0394 ISOgrk3 'ETH': 0x00d0, # latin capital letter ETH, U+00D0 ISOlat1 'Eacute': 0x00c9, # latin capital letter E with acute, U+00C9 ISOlat1 'Ecirc': 0x00ca, # latin capital letter E with circumflex, U+00CA ISOlat1 'Egrave': 0x00c8, # latin capital letter E with grave, U+00C8 ISOlat1 'Epsilon': 0x0395, # greek capital letter epsilon, U+0395 'Eta': 0x0397, # greek capital letter eta, U+0397 'Euml': 0x00cb, # latin capital letter E with diaeresis, U+00CB ISOlat1 'Gamma': 0x0393, # greek capital letter gamma, U+0393 ISOgrk3 'Iacute': 0x00cd, # latin capital letter I with acute, U+00CD ISOlat1 'Icirc': 0x00ce, # latin capital letter I with circumflex, U+00CE ISOlat1 'Igrave': 0x00cc, # latin capital letter I with grave, U+00CC ISOlat1 'Iota': 0x0399, # greek capital letter iota, U+0399 'Iuml': 0x00cf, # latin capital letter I with diaeresis, U+00CF ISOlat1 'Kappa': 0x039a, # greek capital letter kappa, U+039A 'Lambda': 0x039b, # greek capital letter lambda, U+039B ISOgrk3 'Mu': 0x039c, # greek capital letter mu, U+039C 'Ntilde': 0x00d1, # latin capital letter N with tilde, U+00D1 ISOlat1 'Nu': 0x039d, # greek capital letter nu, U+039D 'OElig': 0x0152, # latin capital ligature OE, U+0152 ISOlat2 'Oacute': 0x00d3, # latin capital letter O with acute, U+00D3 ISOlat1 'Ocirc': 0x00d4, # latin capital letter O with circumflex, U+00D4 ISOlat1 'Ograve': 0x00d2, # latin capital letter O with grave, U+00D2 ISOlat1 'Omega': 0x03a9, # greek capital letter omega, U+03A9 ISOgrk3 'Omicron': 0x039f, # greek capital letter omicron, U+039F 'Oslash': 0x00d8, # latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1 'Otilde': 0x00d5, # latin capital letter O with tilde, U+00D5 ISOlat1 'Ouml': 0x00d6, # latin capital letter O with diaeresis, U+00D6 ISOlat1 'Phi': 0x03a6, # greek capital letter phi, U+03A6 ISOgrk3 'Pi': 0x03a0, # greek capital letter pi, U+03A0 ISOgrk3 'Prime': 0x2033, # double prime = seconds = inches, U+2033 ISOtech 'Psi': 0x03a8, # greek capital letter psi, U+03A8 ISOgrk3 'Rho': 0x03a1, # greek capital letter rho, U+03A1 'Scaron': 0x0160, # latin capital letter S with caron, U+0160 ISOlat2 'Sigma': 0x03a3, # greek capital letter sigma, U+03A3 ISOgrk3 'THORN': 0x00de, # latin capital 
letter THORN, U+00DE ISOlat1 'Tau': 0x03a4, # greek capital letter tau, U+03A4 'Theta': 0x0398, # greek capital letter theta, U+0398 ISOgrk3 'Uacute': 0x00da, # latin capital letter U with acute, U+00DA ISOlat1 'Ucirc': 0x00db, # latin capital letter U with circumflex, U+00DB ISOlat1 'Ugrave': 0x00d9, # latin capital letter U with grave, U+00D9 ISOlat1 'Upsilon': 0x03a5, # greek capital letter upsilon, U+03A5 ISOgrk3 'Uuml': 0x00dc, # latin capital letter U with diaeresis, U+00DC ISOlat1 'Xi': 0x039e, # greek capital letter xi, U+039E ISOgrk3 'Yacute': 0x00dd, # latin capital letter Y with acute, U+00DD ISOlat1 'Yuml': 0x0178, # latin capital letter Y with diaeresis, U+0178 ISOlat2 'Zeta': 0x0396, # greek capital letter zeta, U+0396 'aacute': 0x00e1, # latin small letter a with acute, U+00E1 ISOlat1 'acirc': 0x00e2, # latin small letter a with circumflex, U+00E2 ISOlat1 'acute': 0x00b4, # acute accent = spacing acute, U+00B4 ISOdia 'aelig': 0x00e6, # latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 'agrave': 0x00e0, # latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 'alefsym': 0x2135, # alef symbol = first transfinite cardinal, U+2135 NEW 'alpha': 0x03b1, # greek small letter alpha, U+03B1 ISOgrk3 'amp': 0x0026, # ampersand, U+0026 ISOnum 'and': 0x2227, # logical and = wedge, U+2227 ISOtech 'ang': 0x2220, # angle, U+2220 ISOamso 'aring': 0x00e5, # latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 'asymp': 0x2248, # almost equal to = asymptotic to, U+2248 ISOamsr 'atilde': 0x00e3, # latin small letter a with tilde, U+00E3 ISOlat1 'auml': 0x00e4, # latin small letter a with diaeresis, U+00E4 ISOlat1 'bdquo': 0x201e, # double low-9 quotation mark, U+201E NEW 'beta': 0x03b2, # greek small letter beta, U+03B2 ISOgrk3 'brvbar': 0x00a6, # broken bar = broken vertical bar, U+00A6 ISOnum 'bull': 0x2022, # bullet = black small circle, U+2022 ISOpub 'cap': 0x2229, # intersection = cap, U+2229 ISOtech 'ccedil': 0x00e7, # latin small letter c with cedilla, U+00E7 ISOlat1 'cedil': 0x00b8, # cedilla = spacing cedilla, U+00B8 ISOdia 'cent': 0x00a2, # cent sign, U+00A2 ISOnum 'chi': 0x03c7, # greek small letter chi, U+03C7 ISOgrk3 'circ': 0x02c6, # modifier letter circumflex accent, U+02C6 ISOpub 'clubs': 0x2663, # black club suit = shamrock, U+2663 ISOpub 'cong': 0x2245, # approximately equal to, U+2245 ISOtech 'copy': 0x00a9, # copyright sign, U+00A9 ISOnum 'crarr': 0x21b5, # downwards arrow with corner leftwards = carriage return, U+21B5 NEW 'cup': 0x222a, # union = cup, U+222A ISOtech 'curren': 0x00a4, # currency sign, U+00A4 ISOnum 'dArr': 0x21d3, # downwards double arrow, U+21D3 ISOamsa 'dagger': 0x2020, # dagger, U+2020 ISOpub 'darr': 0x2193, # downwards arrow, U+2193 ISOnum 'deg': 0x00b0, # degree sign, U+00B0 ISOnum 'delta': 0x03b4, # greek small letter delta, U+03B4 ISOgrk3 'diams': 0x2666, # black diamond suit, U+2666 ISOpub 'divide': 0x00f7, # division sign, U+00F7 ISOnum 'eacute': 0x00e9, # latin small letter e with acute, U+00E9 ISOlat1 'ecirc': 0x00ea, # latin small letter e with circumflex, U+00EA ISOlat1 'egrave': 0x00e8, # latin small letter e with grave, U+00E8 ISOlat1 'empty': 0x2205, # empty set = null set = diameter, U+2205 ISOamso 'emsp': 0x2003, # em space, U+2003 ISOpub 'ensp': 0x2002, # en space, U+2002 ISOpub 'epsilon': 0x03b5, # greek small letter epsilon, U+03B5 ISOgrk3 'equiv': 0x2261, # identical to, U+2261 ISOtech 'eta': 0x03b7, # greek small letter eta, U+03B7 ISOgrk3 'eth': 0x00f0, # latin small 
letter eth, U+00F0 ISOlat1 'euml': 0x00eb, # latin small letter e with diaeresis, U+00EB ISOlat1 'euro': 0x20ac, # euro sign, U+20AC NEW 'exist': 0x2203, # there exists, U+2203 ISOtech 'fnof': 0x0192, # latin small f with hook = function = florin, U+0192 ISOtech 'forall': 0x2200, # for all, U+2200 ISOtech 'frac12': 0x00bd, # vulgar fraction one half = fraction one half, U+00BD ISOnum 'frac14': 0x00bc, # vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum 'frac34': 0x00be, # vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum 'frasl': 0x2044, # fraction slash, U+2044 NEW 'gamma': 0x03b3, # greek small letter gamma, U+03B3 ISOgrk3 'ge': 0x2265, # greater-than or equal to, U+2265 ISOtech 'gt': 0x003e, # greater-than sign, U+003E ISOnum 'hArr': 0x21d4, # left right double arrow, U+21D4 ISOamsa 'harr': 0x2194, # left right arrow, U+2194 ISOamsa 'hearts': 0x2665, # black heart suit = valentine, U+2665 ISOpub 'hellip': 0x2026, # horizontal ellipsis = three dot leader, U+2026 ISOpub 'iacute': 0x00ed, # latin small letter i with acute, U+00ED ISOlat1 'icirc': 0x00ee, # latin small letter i with circumflex, U+00EE ISOlat1 'iexcl': 0x00a1, # inverted exclamation mark, U+00A1 ISOnum 'igrave': 0x00ec, # latin small letter i with grave, U+00EC ISOlat1 'image': 0x2111, # blackletter capital I = imaginary part, U+2111 ISOamso 'infin': 0x221e, # infinity, U+221E ISOtech 'int': 0x222b, # integral, U+222B ISOtech 'iota': 0x03b9, # greek small letter iota, U+03B9 ISOgrk3 'iquest': 0x00bf, # inverted question mark = turned question mark, U+00BF ISOnum 'isin': 0x2208, # element of, U+2208 ISOtech 'iuml': 0x00ef, # latin small letter i with diaeresis, U+00EF ISOlat1 'kappa': 0x03ba, # greek small letter kappa, U+03BA ISOgrk3 'lArr': 0x21d0, # leftwards double arrow, U+21D0 ISOtech 'lambda': 0x03bb, # greek small letter lambda, U+03BB ISOgrk3 'lang': 0x2329, # left-pointing angle bracket = bra, U+2329 ISOtech 'laquo': 0x00ab, # left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum 'larr': 0x2190, # leftwards arrow, U+2190 ISOnum 'lceil': 0x2308, # left ceiling = apl upstile, U+2308 ISOamsc 'ldquo': 0x201c, # left double quotation mark, U+201C ISOnum 'le': 0x2264, # less-than or equal to, U+2264 ISOtech 'lfloor': 0x230a, # left floor = apl downstile, U+230A ISOamsc 'lowast': 0x2217, # asterisk operator, U+2217 ISOtech 'loz': 0x25ca, # lozenge, U+25CA ISOpub 'lrm': 0x200e, # left-to-right mark, U+200E NEW RFC 2070 'lsaquo': 0x2039, # single left-pointing angle quotation mark, U+2039 ISO proposed 'lsquo': 0x2018, # left single quotation mark, U+2018 ISOnum 'lt': 0x003c, # less-than sign, U+003C ISOnum 'macr': 0x00af, # macron = spacing macron = overline = APL overbar, U+00AF ISOdia 'mdash': 0x2014, # em dash, U+2014 ISOpub 'micro': 0x00b5, # micro sign, U+00B5 ISOnum 'middot': 0x00b7, # middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum 'minus': 0x2212, # minus sign, U+2212 ISOtech 'mu': 0x03bc, # greek small letter mu, U+03BC ISOgrk3 'nabla': 0x2207, # nabla = backward difference, U+2207 ISOtech 'nbsp': 0x00a0, # no-break space = non-breaking space, U+00A0 ISOnum 'ndash': 0x2013, # en dash, U+2013 ISOpub 'ne': 0x2260, # not equal to, U+2260 ISOtech 'ni': 0x220b, # contains as member, U+220B ISOtech 'not': 0x00ac, # not sign, U+00AC ISOnum 'notin': 0x2209, # not an element of, U+2209 ISOtech 'nsub': 0x2284, # not a subset of, U+2284 ISOamsn 'ntilde': 0x00f1, # latin small letter n with tilde, U+00F1 ISOlat1 'nu': 0x03bd, # greek small letter 
nu, U+03BD ISOgrk3 'oacute': 0x00f3, # latin small letter o with acute, U+00F3 ISOlat1 'ocirc': 0x00f4, # latin small letter o with circumflex, U+00F4 ISOlat1 'oelig': 0x0153, # latin small ligature oe, U+0153 ISOlat2 'ograve': 0x00f2, # latin small letter o with grave, U+00F2 ISOlat1 'oline': 0x203e, # overline = spacing overscore, U+203E NEW 'omega': 0x03c9, # greek small letter omega, U+03C9 ISOgrk3 'omicron': 0x03bf, # greek small letter omicron, U+03BF NEW 'oplus': 0x2295, # circled plus = direct sum, U+2295 ISOamsb 'or': 0x2228, # logical or = vee, U+2228 ISOtech 'ordf': 0x00aa, # feminine ordinal indicator, U+00AA ISOnum 'ordm': 0x00ba, # masculine ordinal indicator, U+00BA ISOnum 'oslash': 0x00f8, # latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 'otilde': 0x00f5, # latin small letter o with tilde, U+00F5 ISOlat1 'otimes': 0x2297, # circled times = vector product, U+2297 ISOamsb 'ouml': 0x00f6, # latin small letter o with diaeresis, U+00F6 ISOlat1 'para': 0x00b6, # pilcrow sign = paragraph sign, U+00B6 ISOnum 'part': 0x2202, # partial differential, U+2202 ISOtech 'permil': 0x2030, # per mille sign, U+2030 ISOtech 'perp': 0x22a5, # up tack = orthogonal to = perpendicular, U+22A5 ISOtech 'phi': 0x03c6, # greek small letter phi, U+03C6 ISOgrk3 'pi': 0x03c0, # greek small letter pi, U+03C0 ISOgrk3 'piv': 0x03d6, # greek pi symbol, U+03D6 ISOgrk3 'plusmn': 0x00b1, # plus-minus sign = plus-or-minus sign, U+00B1 ISOnum 'pound': 0x00a3, # pound sign, U+00A3 ISOnum 'prime': 0x2032, # prime = minutes = feet, U+2032 ISOtech 'prod': 0x220f, # n-ary product = product sign, U+220F ISOamsb 'prop': 0x221d, # proportional to, U+221D ISOtech 'psi': 0x03c8, # greek small letter psi, U+03C8 ISOgrk3 'quot': 0x0022, # quotation mark = APL quote, U+0022 ISOnum 'rArr': 0x21d2, # rightwards double arrow, U+21D2 ISOtech 'radic': 0x221a, # square root = radical sign, U+221A ISOtech 'rang': 0x232a, # right-pointing angle bracket = ket, U+232A ISOtech 'raquo': 0x00bb, # right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum 'rarr': 0x2192, # rightwards arrow, U+2192 ISOnum 'rceil': 0x2309, # right ceiling, U+2309 ISOamsc 'rdquo': 0x201d, # right double quotation mark, U+201D ISOnum 'real': 0x211c, # blackletter capital R = real part symbol, U+211C ISOamso 'reg': 0x00ae, # registered sign = registered trade mark sign, U+00AE ISOnum 'rfloor': 0x230b, # right floor, U+230B ISOamsc 'rho': 0x03c1, # greek small letter rho, U+03C1 ISOgrk3 'rlm': 0x200f, # right-to-left mark, U+200F NEW RFC 2070 'rsaquo': 0x203a, # single right-pointing angle quotation mark, U+203A ISO proposed 'rsquo': 0x2019, # right single quotation mark, U+2019 ISOnum 'sbquo': 0x201a, # single low-9 quotation mark, U+201A NEW 'scaron': 0x0161, # latin small letter s with caron, U+0161 ISOlat2 'sdot': 0x22c5, # dot operator, U+22C5 ISOamsb 'sect': 0x00a7, # section sign, U+00A7 ISOnum 'shy': 0x00ad, # soft hyphen = discretionary hyphen, U+00AD ISOnum 'sigma': 0x03c3, # greek small letter sigma, U+03C3 ISOgrk3 'sigmaf': 0x03c2, # greek small letter final sigma, U+03C2 ISOgrk3 'sim': 0x223c, # tilde operator = varies with = similar to, U+223C ISOtech 'spades': 0x2660, # black spade suit, U+2660 ISOpub 'sub': 0x2282, # subset of, U+2282 ISOtech 'sube': 0x2286, # subset of or equal to, U+2286 ISOtech 'sum': 0x2211, # n-ary sumation, U+2211 ISOamsb 'sup': 0x2283, # superset of, U+2283 ISOtech 'sup1': 0x00b9, # superscript one = superscript digit one, U+00B9 ISOnum 'sup2': 0x00b2, # 
superscript two = superscript digit two = squared, U+00B2 ISOnum 'sup3': 0x00b3, # superscript three = superscript digit three = cubed, U+00B3 ISOnum 'supe': 0x2287, # superset of or equal to, U+2287 ISOtech 'szlig': 0x00df, # latin small letter sharp s = ess-zed, U+00DF ISOlat1 'tau': 0x03c4, # greek small letter tau, U+03C4 ISOgrk3 'there4': 0x2234, # therefore, U+2234 ISOtech 'theta': 0x03b8, # greek small letter theta, U+03B8 ISOgrk3 'thetasym': 0x03d1, # greek small letter theta symbol, U+03D1 NEW 'thinsp': 0x2009, # thin space, U+2009 ISOpub 'thorn': 0x00fe, # latin small letter thorn with, U+00FE ISOlat1 'tilde': 0x02dc, # small tilde, U+02DC ISOdia 'times': 0x00d7, # multiplication sign, U+00D7 ISOnum 'trade': 0x2122, # trade mark sign, U+2122 ISOnum 'uArr': 0x21d1, # upwards double arrow, U+21D1 ISOamsa 'uacute': 0x00fa, # latin small letter u with acute, U+00FA ISOlat1 'uarr': 0x2191, # upwards arrow, U+2191 ISOnum 'ucirc': 0x00fb, # latin small letter u with circumflex, U+00FB ISOlat1 'ugrave': 0x00f9, # latin small letter u with grave, U+00F9 ISOlat1 'uml': 0x00a8, # diaeresis = spacing diaeresis, U+00A8 ISOdia 'upsih': 0x03d2, # greek upsilon with hook symbol, U+03D2 NEW 'upsilon': 0x03c5, # greek small letter upsilon, U+03C5 ISOgrk3 'uuml': 0x00fc, # latin small letter u with diaeresis, U+00FC ISOlat1 'weierp': 0x2118, # script capital P = power set = Weierstrass p, U+2118 ISOamso 'xi': 0x03be, # greek small letter xi, U+03BE ISOgrk3 'yacute': 0x00fd, # latin small letter y with acute, U+00FD ISOlat1 'yen': 0x00a5, # yen sign = yuan sign, U+00A5 ISOnum 'yuml': 0x00ff, # latin small letter y with diaeresis, U+00FF ISOlat1 'zeta': 0x03b6, # greek small letter zeta, U+03B6 ISOgrk3 'zwj': 0x200d, # zero width joiner, U+200D NEW RFC 2070 'zwnj': 0x200c, # zero width non-joiner, U+200C NEW RFC 2070 } # maps the Unicode codepoint to the HTML entity name codepoint2name = {} # maps the HTML entity name to the character # (or a character reference if the character is outside the Latin-1 range) entitydefs = {} for (name, codepoint) in name2codepoint.iteritems(): codepoint2name[codepoint] = name if codepoint <= 0xff: entitydefs[name] = unichr(codepoint) else: entitydefs[name] = '&#%d;' % codepoint del name, codepoint - + diff --git a/modules/elmsubmit/lib/myhtmlentitydefs.py.wml b/modules/elmsubmit/lib/myhtmlentitydefs.py.wml deleted file mode 100644 index f108bb329..000000000 --- a/modules/elmsubmit/lib/myhtmlentitydefs.py.wml +++ /dev/null @@ -1,274 +0,0 @@ -"""HTML character entity references.""" - -# maps the HTML entity name to the Unicode codepoint -name2codepoint = { - 'AElig': 0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 - 'Aacute': 0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1 - 'Acirc': 0x00c2, # latin capital letter A with circumflex, U+00C2 ISOlat1 - 'Agrave': 0x00c0, # latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 - 'Alpha': 0x0391, # greek capital letter alpha, U+0391 - 'Aring': 0x00c5, # latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 - 'Atilde': 0x00c3, # latin capital letter A with tilde, U+00C3 ISOlat1 - 'Auml': 0x00c4, # latin capital letter A with diaeresis, U+00C4 ISOlat1 - 'Beta': 0x0392, # greek capital letter beta, U+0392 - 'Ccedil': 0x00c7, # latin capital letter C with cedilla, U+00C7 ISOlat1 - 'Chi': 0x03a7, # greek capital letter chi, U+03A7 - 'Dagger': 0x2021, # double dagger, U+2021 ISOpub - 'Delta': 0x0394, # greek capital letter delta, 
U+0394 ISOgrk3 - 'ETH': 0x00d0, # latin capital letter ETH, U+00D0 ISOlat1 - 'Eacute': 0x00c9, # latin capital letter E with acute, U+00C9 ISOlat1 - 'Ecirc': 0x00ca, # latin capital letter E with circumflex, U+00CA ISOlat1 - 'Egrave': 0x00c8, # latin capital letter E with grave, U+00C8 ISOlat1 - 'Epsilon': 0x0395, # greek capital letter epsilon, U+0395 - 'Eta': 0x0397, # greek capital letter eta, U+0397 - 'Euml': 0x00cb, # latin capital letter E with diaeresis, U+00CB ISOlat1 - 'Gamma': 0x0393, # greek capital letter gamma, U+0393 ISOgrk3 - 'Iacute': 0x00cd, # latin capital letter I with acute, U+00CD ISOlat1 - 'Icirc': 0x00ce, # latin capital letter I with circumflex, U+00CE ISOlat1 - 'Igrave': 0x00cc, # latin capital letter I with grave, U+00CC ISOlat1 - 'Iota': 0x0399, # greek capital letter iota, U+0399 - 'Iuml': 0x00cf, # latin capital letter I with diaeresis, U+00CF ISOlat1 - 'Kappa': 0x039a, # greek capital letter kappa, U+039A - 'Lambda': 0x039b, # greek capital letter lambda, U+039B ISOgrk3 - 'Mu': 0x039c, # greek capital letter mu, U+039C - 'Ntilde': 0x00d1, # latin capital letter N with tilde, U+00D1 ISOlat1 - 'Nu': 0x039d, # greek capital letter nu, U+039D - 'OElig': 0x0152, # latin capital ligature OE, U+0152 ISOlat2 - 'Oacute': 0x00d3, # latin capital letter O with acute, U+00D3 ISOlat1 - 'Ocirc': 0x00d4, # latin capital letter O with circumflex, U+00D4 ISOlat1 - 'Ograve': 0x00d2, # latin capital letter O with grave, U+00D2 ISOlat1 - 'Omega': 0x03a9, # greek capital letter omega, U+03A9 ISOgrk3 - 'Omicron': 0x039f, # greek capital letter omicron, U+039F - 'Oslash': 0x00d8, # latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1 - 'Otilde': 0x00d5, # latin capital letter O with tilde, U+00D5 ISOlat1 - 'Ouml': 0x00d6, # latin capital letter O with diaeresis, U+00D6 ISOlat1 - 'Phi': 0x03a6, # greek capital letter phi, U+03A6 ISOgrk3 - 'Pi': 0x03a0, # greek capital letter pi, U+03A0 ISOgrk3 - 'Prime': 0x2033, # double prime = seconds = inches, U+2033 ISOtech - 'Psi': 0x03a8, # greek capital letter psi, U+03A8 ISOgrk3 - 'Rho': 0x03a1, # greek capital letter rho, U+03A1 - 'Scaron': 0x0160, # latin capital letter S with caron, U+0160 ISOlat2 - 'Sigma': 0x03a3, # greek capital letter sigma, U+03A3 ISOgrk3 - 'THORN': 0x00de, # latin capital letter THORN, U+00DE ISOlat1 - 'Tau': 0x03a4, # greek capital letter tau, U+03A4 - 'Theta': 0x0398, # greek capital letter theta, U+0398 ISOgrk3 - 'Uacute': 0x00da, # latin capital letter U with acute, U+00DA ISOlat1 - 'Ucirc': 0x00db, # latin capital letter U with circumflex, U+00DB ISOlat1 - 'Ugrave': 0x00d9, # latin capital letter U with grave, U+00D9 ISOlat1 - 'Upsilon': 0x03a5, # greek capital letter upsilon, U+03A5 ISOgrk3 - 'Uuml': 0x00dc, # latin capital letter U with diaeresis, U+00DC ISOlat1 - 'Xi': 0x039e, # greek capital letter xi, U+039E ISOgrk3 - 'Yacute': 0x00dd, # latin capital letter Y with acute, U+00DD ISOlat1 - 'Yuml': 0x0178, # latin capital letter Y with diaeresis, U+0178 ISOlat2 - 'Zeta': 0x0396, # greek capital letter zeta, U+0396 - 'aacute': 0x00e1, # latin small letter a with acute, U+00E1 ISOlat1 - 'acirc': 0x00e2, # latin small letter a with circumflex, U+00E2 ISOlat1 - 'acute': 0x00b4, # acute accent = spacing acute, U+00B4 ISOdia - 'aelig': 0x00e6, # latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 - 'agrave': 0x00e0, # latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 - 'alefsym': 0x2135, # alef symbol = first transfinite cardinal, U+2135 NEW - 
'alpha': 0x03b1, # greek small letter alpha, U+03B1 ISOgrk3 - 'amp': 0x0026, # ampersand, U+0026 ISOnum - 'and': 0x2227, # logical and = wedge, U+2227 ISOtech - 'ang': 0x2220, # angle, U+2220 ISOamso - 'aring': 0x00e5, # latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 - 'asymp': 0x2248, # almost equal to = asymptotic to, U+2248 ISOamsr - 'atilde': 0x00e3, # latin small letter a with tilde, U+00E3 ISOlat1 - 'auml': 0x00e4, # latin small letter a with diaeresis, U+00E4 ISOlat1 - 'bdquo': 0x201e, # double low-9 quotation mark, U+201E NEW - 'beta': 0x03b2, # greek small letter beta, U+03B2 ISOgrk3 - 'brvbar': 0x00a6, # broken bar = broken vertical bar, U+00A6 ISOnum - 'bull': 0x2022, # bullet = black small circle, U+2022 ISOpub - 'cap': 0x2229, # intersection = cap, U+2229 ISOtech - 'ccedil': 0x00e7, # latin small letter c with cedilla, U+00E7 ISOlat1 - 'cedil': 0x00b8, # cedilla = spacing cedilla, U+00B8 ISOdia - 'cent': 0x00a2, # cent sign, U+00A2 ISOnum - 'chi': 0x03c7, # greek small letter chi, U+03C7 ISOgrk3 - 'circ': 0x02c6, # modifier letter circumflex accent, U+02C6 ISOpub - 'clubs': 0x2663, # black club suit = shamrock, U+2663 ISOpub - 'cong': 0x2245, # approximately equal to, U+2245 ISOtech - 'copy': 0x00a9, # copyright sign, U+00A9 ISOnum - 'crarr': 0x21b5, # downwards arrow with corner leftwards = carriage return, U+21B5 NEW - 'cup': 0x222a, # union = cup, U+222A ISOtech - 'curren': 0x00a4, # currency sign, U+00A4 ISOnum - 'dArr': 0x21d3, # downwards double arrow, U+21D3 ISOamsa - 'dagger': 0x2020, # dagger, U+2020 ISOpub - 'darr': 0x2193, # downwards arrow, U+2193 ISOnum - 'deg': 0x00b0, # degree sign, U+00B0 ISOnum - 'delta': 0x03b4, # greek small letter delta, U+03B4 ISOgrk3 - 'diams': 0x2666, # black diamond suit, U+2666 ISOpub - 'divide': 0x00f7, # division sign, U+00F7 ISOnum - 'eacute': 0x00e9, # latin small letter e with acute, U+00E9 ISOlat1 - 'ecirc': 0x00ea, # latin small letter e with circumflex, U+00EA ISOlat1 - 'egrave': 0x00e8, # latin small letter e with grave, U+00E8 ISOlat1 - 'empty': 0x2205, # empty set = null set = diameter, U+2205 ISOamso - 'emsp': 0x2003, # em space, U+2003 ISOpub - 'ensp': 0x2002, # en space, U+2002 ISOpub - 'epsilon': 0x03b5, # greek small letter epsilon, U+03B5 ISOgrk3 - 'equiv': 0x2261, # identical to, U+2261 ISOtech - 'eta': 0x03b7, # greek small letter eta, U+03B7 ISOgrk3 - 'eth': 0x00f0, # latin small letter eth, U+00F0 ISOlat1 - 'euml': 0x00eb, # latin small letter e with diaeresis, U+00EB ISOlat1 - 'euro': 0x20ac, # euro sign, U+20AC NEW - 'exist': 0x2203, # there exists, U+2203 ISOtech - 'fnof': 0x0192, # latin small f with hook = function = florin, U+0192 ISOtech - 'forall': 0x2200, # for all, U+2200 ISOtech - 'frac12': 0x00bd, # vulgar fraction one half = fraction one half, U+00BD ISOnum - 'frac14': 0x00bc, # vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum - 'frac34': 0x00be, # vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum - 'frasl': 0x2044, # fraction slash, U+2044 NEW - 'gamma': 0x03b3, # greek small letter gamma, U+03B3 ISOgrk3 - 'ge': 0x2265, # greater-than or equal to, U+2265 ISOtech - 'gt': 0x003e, # greater-than sign, U+003E ISOnum - 'hArr': 0x21d4, # left right double arrow, U+21D4 ISOamsa - 'harr': 0x2194, # left right arrow, U+2194 ISOamsa - 'hearts': 0x2665, # black heart suit = valentine, U+2665 ISOpub - 'hellip': 0x2026, # horizontal ellipsis = three dot leader, U+2026 ISOpub - 'iacute': 0x00ed, # latin small letter i with acute, U+00ED 
ISOlat1 - 'icirc': 0x00ee, # latin small letter i with circumflex, U+00EE ISOlat1 - 'iexcl': 0x00a1, # inverted exclamation mark, U+00A1 ISOnum - 'igrave': 0x00ec, # latin small letter i with grave, U+00EC ISOlat1 - 'image': 0x2111, # blackletter capital I = imaginary part, U+2111 ISOamso - 'infin': 0x221e, # infinity, U+221E ISOtech - 'int': 0x222b, # integral, U+222B ISOtech - 'iota': 0x03b9, # greek small letter iota, U+03B9 ISOgrk3 - 'iquest': 0x00bf, # inverted question mark = turned question mark, U+00BF ISOnum - 'isin': 0x2208, # element of, U+2208 ISOtech - 'iuml': 0x00ef, # latin small letter i with diaeresis, U+00EF ISOlat1 - 'kappa': 0x03ba, # greek small letter kappa, U+03BA ISOgrk3 - 'lArr': 0x21d0, # leftwards double arrow, U+21D0 ISOtech - 'lambda': 0x03bb, # greek small letter lambda, U+03BB ISOgrk3 - 'lang': 0x2329, # left-pointing angle bracket = bra, U+2329 ISOtech - 'laquo': 0x00ab, # left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum - 'larr': 0x2190, # leftwards arrow, U+2190 ISOnum - 'lceil': 0x2308, # left ceiling = apl upstile, U+2308 ISOamsc - 'ldquo': 0x201c, # left double quotation mark, U+201C ISOnum - 'le': 0x2264, # less-than or equal to, U+2264 ISOtech - 'lfloor': 0x230a, # left floor = apl downstile, U+230A ISOamsc - 'lowast': 0x2217, # asterisk operator, U+2217 ISOtech - 'loz': 0x25ca, # lozenge, U+25CA ISOpub - 'lrm': 0x200e, # left-to-right mark, U+200E NEW RFC 2070 - 'lsaquo': 0x2039, # single left-pointing angle quotation mark, U+2039 ISO proposed - 'lsquo': 0x2018, # left single quotation mark, U+2018 ISOnum - 'lt': 0x003c, # less-than sign, U+003C ISOnum - 'macr': 0x00af, # macron = spacing macron = overline = APL overbar, U+00AF ISOdia - 'mdash': 0x2014, # em dash, U+2014 ISOpub - 'micro': 0x00b5, # micro sign, U+00B5 ISOnum - 'middot': 0x00b7, # middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum - 'minus': 0x2212, # minus sign, U+2212 ISOtech - 'mu': 0x03bc, # greek small letter mu, U+03BC ISOgrk3 - 'nabla': 0x2207, # nabla = backward difference, U+2207 ISOtech - 'nbsp': 0x00a0, # no-break space = non-breaking space, U+00A0 ISOnum - 'ndash': 0x2013, # en dash, U+2013 ISOpub - 'ne': 0x2260, # not equal to, U+2260 ISOtech - 'ni': 0x220b, # contains as member, U+220B ISOtech - 'not': 0x00ac, # not sign, U+00AC ISOnum - 'notin': 0x2209, # not an element of, U+2209 ISOtech - 'nsub': 0x2284, # not a subset of, U+2284 ISOamsn - 'ntilde': 0x00f1, # latin small letter n with tilde, U+00F1 ISOlat1 - 'nu': 0x03bd, # greek small letter nu, U+03BD ISOgrk3 - 'oacute': 0x00f3, # latin small letter o with acute, U+00F3 ISOlat1 - 'ocirc': 0x00f4, # latin small letter o with circumflex, U+00F4 ISOlat1 - 'oelig': 0x0153, # latin small ligature oe, U+0153 ISOlat2 - 'ograve': 0x00f2, # latin small letter o with grave, U+00F2 ISOlat1 - 'oline': 0x203e, # overline = spacing overscore, U+203E NEW - 'omega': 0x03c9, # greek small letter omega, U+03C9 ISOgrk3 - 'omicron': 0x03bf, # greek small letter omicron, U+03BF NEW - 'oplus': 0x2295, # circled plus = direct sum, U+2295 ISOamsb - 'or': 0x2228, # logical or = vee, U+2228 ISOtech - 'ordf': 0x00aa, # feminine ordinal indicator, U+00AA ISOnum - 'ordm': 0x00ba, # masculine ordinal indicator, U+00BA ISOnum - 'oslash': 0x00f8, # latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 - 'otilde': 0x00f5, # latin small letter o with tilde, U+00F5 ISOlat1 - 'otimes': 0x2297, # circled times = vector product, U+2297 ISOamsb - 'ouml': 0x00f6, # latin small letter o 
with diaeresis, U+00F6 ISOlat1 - 'para': 0x00b6, # pilcrow sign = paragraph sign, U+00B6 ISOnum - 'part': 0x2202, # partial differential, U+2202 ISOtech - 'permil': 0x2030, # per mille sign, U+2030 ISOtech - 'perp': 0x22a5, # up tack = orthogonal to = perpendicular, U+22A5 ISOtech - 'phi': 0x03c6, # greek small letter phi, U+03C6 ISOgrk3 - 'pi': 0x03c0, # greek small letter pi, U+03C0 ISOgrk3 - 'piv': 0x03d6, # greek pi symbol, U+03D6 ISOgrk3 - 'plusmn': 0x00b1, # plus-minus sign = plus-or-minus sign, U+00B1 ISOnum - 'pound': 0x00a3, # pound sign, U+00A3 ISOnum - 'prime': 0x2032, # prime = minutes = feet, U+2032 ISOtech - 'prod': 0x220f, # n-ary product = product sign, U+220F ISOamsb - 'prop': 0x221d, # proportional to, U+221D ISOtech - 'psi': 0x03c8, # greek small letter psi, U+03C8 ISOgrk3 - 'quot': 0x0022, # quotation mark = APL quote, U+0022 ISOnum - 'rArr': 0x21d2, # rightwards double arrow, U+21D2 ISOtech - 'radic': 0x221a, # square root = radical sign, U+221A ISOtech - 'rang': 0x232a, # right-pointing angle bracket = ket, U+232A ISOtech - 'raquo': 0x00bb, # right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum - 'rarr': 0x2192, # rightwards arrow, U+2192 ISOnum - 'rceil': 0x2309, # right ceiling, U+2309 ISOamsc - 'rdquo': 0x201d, # right double quotation mark, U+201D ISOnum - 'real': 0x211c, # blackletter capital R = real part symbol, U+211C ISOamso - 'reg': 0x00ae, # registered sign = registered trade mark sign, U+00AE ISOnum - 'rfloor': 0x230b, # right floor, U+230B ISOamsc - 'rho': 0x03c1, # greek small letter rho, U+03C1 ISOgrk3 - 'rlm': 0x200f, # right-to-left mark, U+200F NEW RFC 2070 - 'rsaquo': 0x203a, # single right-pointing angle quotation mark, U+203A ISO proposed - 'rsquo': 0x2019, # right single quotation mark, U+2019 ISOnum - 'sbquo': 0x201a, # single low-9 quotation mark, U+201A NEW - 'scaron': 0x0161, # latin small letter s with caron, U+0161 ISOlat2 - 'sdot': 0x22c5, # dot operator, U+22C5 ISOamsb - 'sect': 0x00a7, # section sign, U+00A7 ISOnum - 'shy': 0x00ad, # soft hyphen = discretionary hyphen, U+00AD ISOnum - 'sigma': 0x03c3, # greek small letter sigma, U+03C3 ISOgrk3 - 'sigmaf': 0x03c2, # greek small letter final sigma, U+03C2 ISOgrk3 - 'sim': 0x223c, # tilde operator = varies with = similar to, U+223C ISOtech - 'spades': 0x2660, # black spade suit, U+2660 ISOpub - 'sub': 0x2282, # subset of, U+2282 ISOtech - 'sube': 0x2286, # subset of or equal to, U+2286 ISOtech - 'sum': 0x2211, # n-ary sumation, U+2211 ISOamsb - 'sup': 0x2283, # superset of, U+2283 ISOtech - 'sup1': 0x00b9, # superscript one = superscript digit one, U+00B9 ISOnum - 'sup2': 0x00b2, # superscript two = superscript digit two = squared, U+00B2 ISOnum - 'sup3': 0x00b3, # superscript three = superscript digit three = cubed, U+00B3 ISOnum - 'supe': 0x2287, # superset of or equal to, U+2287 ISOtech - 'szlig': 0x00df, # latin small letter sharp s = ess-zed, U+00DF ISOlat1 - 'tau': 0x03c4, # greek small letter tau, U+03C4 ISOgrk3 - 'there4': 0x2234, # therefore, U+2234 ISOtech - 'theta': 0x03b8, # greek small letter theta, U+03B8 ISOgrk3 - 'thetasym': 0x03d1, # greek small letter theta symbol, U+03D1 NEW - 'thinsp': 0x2009, # thin space, U+2009 ISOpub - 'thorn': 0x00fe, # latin small letter thorn with, U+00FE ISOlat1 - 'tilde': 0x02dc, # small tilde, U+02DC ISOdia - 'times': 0x00d7, # multiplication sign, U+00D7 ISOnum - 'trade': 0x2122, # trade mark sign, U+2122 ISOnum - 'uArr': 0x21d1, # upwards double arrow, U+21D1 ISOamsa - 'uacute': 0x00fa, # latin small letter u 
with acute, U+00FA ISOlat1 - 'uarr': 0x2191, # upwards arrow, U+2191 ISOnum - 'ucirc': 0x00fb, # latin small letter u with circumflex, U+00FB ISOlat1 - 'ugrave': 0x00f9, # latin small letter u with grave, U+00F9 ISOlat1 - 'uml': 0x00a8, # diaeresis = spacing diaeresis, U+00A8 ISOdia - 'upsih': 0x03d2, # greek upsilon with hook symbol, U+03D2 NEW - 'upsilon': 0x03c5, # greek small letter upsilon, U+03C5 ISOgrk3 - 'uuml': 0x00fc, # latin small letter u with diaeresis, U+00FC ISOlat1 - 'weierp': 0x2118, # script capital P = power set = Weierstrass p, U+2118 ISOamso - 'xi': 0x03be, # greek small letter xi, U+03BE ISOgrk3 - 'yacute': 0x00fd, # latin small letter y with acute, U+00FD ISOlat1 - 'yen': 0x00a5, # yen sign = yuan sign, U+00A5 ISOnum - 'yuml': 0x00ff, # latin small letter y with diaeresis, U+00FF ISOlat1 - 'zeta': 0x03b6, # greek small letter zeta, U+03B6 ISOgrk3 - 'zwj': 0x200d, # zero width joiner, U+200D NEW RFC 2070 - 'zwnj': 0x200c, # zero width non-joiner, U+200C NEW RFC 2070 -} - -# maps the Unicode codepoint to the HTML entity name -codepoint2name = {} - -# maps the HTML entity name to the character -# (or a character reference if the character is outside the Latin-1 range) -entitydefs = {} - -for (name, codepoint) in name2codepoint.iteritems(): - codepoint2name[codepoint] = name - if codepoint <= 0xff: - entitydefs[name] = unichr(codepoint) - else: - entitydefs[name] = '&#%d;' % codepoint - -del name, codepoint - diff --git a/modules/elmsubmit/lib/parsetab.py b/modules/elmsubmit/lib/parsetab.py index 64206e060..2f486790c 100644 --- a/modules/elmsubmit/lib/parsetab.py +++ b/modules/elmsubmit/lib/parsetab.py @@ -1,31 +1,30 @@ - # parsetab.py # This file is automatically generated. Do not edit. _lr_method = 'SLR' _lr_signature = '\xc9\x9c*\xe5=\x87\xc5Q?\xb2h\x04\x98\x90_@' _lr_action_items = {'VALUE':([4,],[7,]),'CDSON':([0,],[1,]),'$':([8,2,],[-1,0,]),'KEY':([1,3,7,],[4,4,-4,]),'CDSOFF':([7,5,3,6,],[-4,8,-2,-3,]),} _lr_action = { } for _k, _v in _lr_action_items.items(): for _x,_y in zip(_v[0],_v[1]): _lr_action[(_x,_k)] = _y del _lr_action_items _lr_goto_items = {'assignmentList':([1,3,],[5,6,]),'submission':([0,],[2,]),'assignment':([1,3,],[3,3,]),} _lr_goto = { } for _k, _v in _lr_goto_items.items(): for _x,_y in zip(_v[0],_v[1]): _lr_goto[(_x,_k)] = _y del _lr_goto_items _lr_productions = [ ("S'",1,None,None,None), ('submission',3,'p_submission','elmsubmit_submission_parser.py',54), ('assignmentList',1,'p_assignmentList','elmsubmit_submission_parser.py',57), ('assignmentList',2,'p_assignmentList','elmsubmit_submission_parser.py',58), ('assignment',2,'p_assignment','elmsubmit_submission_parser.py',61), ] - + diff --git a/modules/elmsubmit/lib/parsetab.py.wml b/modules/elmsubmit/lib/parsetab.py.wml deleted file mode 100644 index 64206e060..000000000 --- a/modules/elmsubmit/lib/parsetab.py.wml +++ /dev/null @@ -1,31 +0,0 @@ - -# parsetab.py -# This file is automatically generated. Do not edit. 
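parsetab.py above (and the identical .py.wml copy deleted below) is the table cache that yacc.py writes out and re-imports so the grammar does not have to be re-analysed on every run. Read with the conventions of the parsing engine further down: an _lr_action entry keyed by (state, lookahead) is a shift to that state when positive, a reduction by _lr_productions[-n] when negative, and an accept when zero; _lr_goto gives the state to enter after a reduction produces a nonterminal, and _lr_signature is an md5-based signature of the grammar used to decide when the tables must be regenerated. A small, illustrative probe of the tables, assuming the module is importable from the working directory:

    # Illustrative only: parsetab.py expands _lr_action and _lr_goto at import time.
    import parsetab

    print parsetab._lr_method                 # 'SLR'
    print parsetab._lr_action[(0, 'CDSON')]   # 1  -> from state 0, shift to state 1 on CDSON
    print parsetab._lr_action[(2, '$')]       # 0  -> accept in state 2 at end of input
    print parsetab._lr_productions[1]         # the 'submission' rule from elmsubmit_submission_parser.py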
- -_lr_method = 'SLR' - -_lr_signature = '\xc9\x9c*\xe5=\x87\xc5Q?\xb2h\x04\x98\x90_@' - -_lr_action_items = {'VALUE':([4,],[7,]),'CDSON':([0,],[1,]),'$':([8,2,],[-1,0,]),'KEY':([1,3,7,],[4,4,-4,]),'CDSOFF':([7,5,3,6,],[-4,8,-2,-3,]),} - -_lr_action = { } -for _k, _v in _lr_action_items.items(): - for _x,_y in zip(_v[0],_v[1]): - _lr_action[(_x,_k)] = _y -del _lr_action_items - -_lr_goto_items = {'assignmentList':([1,3,],[5,6,]),'submission':([0,],[2,]),'assignment':([1,3,],[3,3,]),} - -_lr_goto = { } -for _k, _v in _lr_goto_items.items(): - for _x,_y in zip(_v[0],_v[1]): - _lr_goto[(_x,_k)] = _y -del _lr_goto_items -_lr_productions = [ - ("S'",1,None,None,None), - ('submission',3,'p_submission','elmsubmit_submission_parser.py',54), - ('assignmentList',1,'p_assignmentList','elmsubmit_submission_parser.py',57), - ('assignmentList',2,'p_assignmentList','elmsubmit_submission_parser.py',58), - ('assignment',2,'p_assignment','elmsubmit_submission_parser.py',61), -] - diff --git a/modules/elmsubmit/lib/yacc.py b/modules/elmsubmit/lib/yacc.py index 9b9ffc79e..870a46c4b 100644 --- a/modules/elmsubmit/lib/yacc.py +++ b/modules/elmsubmit/lib/yacc.py @@ -1,2418 +1,2418 @@ -#----------------------------------------------------------------------------- +#----------------------------------------------------------------------------- # ply: yacc.py # # Author(s): David M. Beazley (beazley@cs.uchicago.edu) # Department of Computer Science # University of Chicago # Chicago, IL 60637 # # Copyright (C) 2001-2004, David M. Beazley # # $Header$ # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # See the file COPYING for a complete copy of the LGPL. # # # This implements an LR parser that is constructed from grammar rules defined # as Python functions. Roughly speaking, this module is a cross between # John Aycock's Spark system and the GNU bison utility. # # The current implementation is only somewhat object-oriented. The # LR parser itself is defined in terms of an object (which allows multiple # parsers to co-exist). However, most of the variables used during table # construction are defined in terms of global variables. Users shouldn't # notice unless they are trying to define multiple parsers at the same # time using threads (in which case they should have their head examined). # # This implementation supports both SLR and LALR(1) parsing. LALR(1) # support was implemented by Elias Ioup (ezioup@alumni.uchicago.edu) # and hacked abit by Dave to run faster. # # :::::::: WARNING ::::::: # # Construction of LR parsing tables is fairly complicated and expensive. # To make this module run fast, a *LOT* of work has been put into # optimization---often at the expensive of readability and what might # consider to be good Python "coding style." Modify the code at your # own risk! 
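Before the implementation that follows, a minimal, hypothetical grammar module may help orient the reader. It is not part of this diff (elmsubmit's real grammar lives in elmsubmit_submission_parser.py), but it exercises exactly the conventions yacc.py relies on: a shared `tokens` list, p_-prefixed action functions whose docstrings hold the productions, an optional p_error handler, and a yacc() call that writes a parsetab.py cache like the one shown earlier.

    # Hypothetical example module, not part of the CDSware sources.
    import lex
    import yacc

    tokens = ('NUMBER', 'PLUS')              # shared between the lexer and the parser

    # Lexer rules, following the same conventions sketched after lex.py above.
    t_PLUS   = r'\+'
    t_ignore = ' \t'

    def t_NUMBER(t):
        r'\d+'
        t.value = int(t.value)
        return t

    # Parser rules: the docstring is the production, p[0] is the value assigned to
    # the left-hand side, p[1]..p[n] are the values of the right-hand-side symbols.
    def p_expr_plus(p):
        'expr : expr PLUS NUMBER'
        p[0] = p[1] + p[3]

    def p_expr_number(p):
        'expr : NUMBER'
        p[0] = p[1]

    def p_error(p):
        print "Syntax error near", p

    lex.lex()
    parser = yacc.yacc()                     # defaults: SLR tables cached in parsetab.py
    print parser.parse('1 + 2 + 3')          # -> 6

Grammar functions receive a YaccProduction object, so p.lineno(n) and p.linespan(n) report source positions and p.pushback(k) can return symbols to the input, as defined in the engine below.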
# ---------------------------------------------------------------------------- __version__ = "1.5" #----------------------------------------------------------------------------- # === User configurable parameters === # # Change these to modify the default behavior of yacc (if you wish) #----------------------------------------------------------------------------- yaccdebug = 1 # Debugging mode. If set, yacc generates a # a 'parser.out' file in the current directory debug_file = 'parser.out' # Default name of the debugging file tab_module = 'parsetab' # Default name of the table module default_lr = 'SLR' # Default LR table generation method error_count = 3 # Number of symbols that must be shifted to leave recovery mode import re, types, sys, cStringIO, md5, os.path # Exception raised for yacc-related errors class YaccError(Exception): pass #----------------------------------------------------------------------------- # === LR Parsing Engine === # # The following classes are used for the LR parser itself. These are not # used during table construction and are independent of the actual LR # table generation algorithm #----------------------------------------------------------------------------- # This class is used to hold non-terminal grammar symbols during parsing. # It normally has the following attributes set: # .type = Grammar symbol type # .value = Symbol value # .lineno = Starting line number # .endlineno = Ending line number (optional, set automatically) class YaccSymbol: def __str__(self): return self.type def __repr__(self): return str(self) # This class is a wrapper around the objects actually passed to each # grammar rule. Index lookup and assignment actually assign the # .value attribute of the underlying YaccSymbol object. # The lineno() method returns the line number of a given # item (or 0 if not defined). The linespan() method returns # a tuple of (startline,endline) representing the range of lines # for a symbol. class YaccProduction: def __init__(self,s): self.slice = s self.pbstack = [] def __getitem__(self,n): return self.slice[n].value def __setitem__(self,n,v): self.slice[n].value = v def __len__(self): return len(self.slice) def lineno(self,n): return getattr(self.slice[n],"lineno",0) def linespan(self,n): startline = getattr(self.slice[n],"lineno",0) endline = getattr(self.slice[n],"endlineno",startline) return startline,endline def pushback(self,n): if n <= 0: raise ValueError, "Expected a positive value" if n > (len(self.slice)-1): raise ValueError, "Can't push %d tokens. Only %d are available." % (n,len(self.slice)-1) for i in range(0,n): self.pbstack.append(self.slice[-i-1]) # The LR Parsing engine. This is defined as a class so that multiple parsers # can exist in the same process. A user never instantiates this directly. # Instead, the global yacc() function should be used to create a suitable Parser # object. class Parser: def __init__(self,magic=None): # This is a hack to keep users from trying to instantiate a Parser # object directly. if magic != "xyzzy": raise YaccError, "Can't instantiate Parser. Use yacc() instead." 
# Reset internal state self.productions = None # List of productions self.errorfunc = None # Error handling function self.action = { } # LR Action table self.goto = { } # LR goto table self.require = { } # Attribute require table self.method = "Unknown LR" # Table construction method used def errok(self): self.errorcount = 0 def restart(self): del self.statestack[:] del self.symstack[:] sym = YaccSymbol() sym.type = '$' self.symstack.append(sym) self.statestack.append(0) def parse(self,input=None,lexer=None,debug=0): lookahead = None # Current lookahead symbol lookaheadstack = [ ] # Stack of lookahead symbols actions = self.action # Local reference to action table goto = self.goto # Local reference to goto table prod = self.productions # Local reference to production list pslice = YaccProduction(None) # Production object passed to grammar rules pslice.parser = self # Parser object self.errorcount = 0 # Used during error recovery # If no lexer was given, we will try to use the lex module if not lexer: import lex as lexer pslice.lexer = lexer # If input was supplied, pass to lexer if input: lexer.input(input) # Tokenize function get_token = lexer.token statestack = [ ] # Stack of parsing states self.statestack = statestack symstack = [ ] # Stack of grammar symbols self.symstack = symstack errtoken = None # Err token # The start state is assumed to be (0,$) statestack.append(0) sym = YaccSymbol() sym.type = '$' symstack.append(sym) while 1: # Get the next symbol on the input. If a lookahead symbol # is already set, we just use that. Otherwise, we'll pull # the next token off of the lookaheadstack or from the lexer if not lookahead: if not lookaheadstack: lookahead = get_token() # Get the next token else: lookahead = lookaheadstack.pop() if not lookahead: lookahead = YaccSymbol() lookahead.type = '$' if debug: errorlead = ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip() # Check the action table s = statestack[-1] ltype = lookahead.type t = actions.get((s,ltype),None) if t is not None: if t > 0: # shift a symbol on the stack if ltype == '$': # Error, end of input sys.stderr.write("yacc: Parse error. EOF\n") return statestack.append(t) if debug > 1: sys.stderr.write("%-60s shift state %s\n" % (errorlead, t)) symstack.append(lookahead) lookahead = None # Decrease error count on successful shift if self.errorcount > 0: self.errorcount -= 1 continue if t < 0: # reduce a symbol on the stack, emit a production p = prod[-t] pname = p.name plen = p.len # Get production function sym = YaccSymbol() sym.type = pname # Production name sym.value = None if debug > 1: sys.stderr.write("%-60s reduce %d\n" % (errorlead, -t)) if plen: targ = symstack[-plen-1:] targ[0] = sym try: sym.lineno = targ[1].lineno sym.endlineno = getattr(targ[-1],"endlineno",targ[-1].lineno) except AttributeError: sym.lineno = 0 del symstack[-plen:] del statestack[-plen:] else: sym.lineno = 0 targ = [ sym ] pslice.slice = targ pslice.pbstack = [] # Call the grammar rule with our special slice object p.func(pslice) # If there was a pushback, put that on the stack if pslice.pbstack: lookaheadstack.append(lookahead) for _t in pslice.pbstack: lookaheadstack.append(_t) lookahead = None symstack.append(sym) statestack.append(goto[statestack[-1],pname]) continue if t == 0: n = symstack[-1] return getattr(n,"value",None) sys.stderr.write(errorlead, "\n") if t == None: if debug: sys.stderr.write(errorlead + "\n") # We have some kind of parsing error here. 
To handle # this, we are going to push the current token onto # the tokenstack and replace it with an 'error' token. # If there are any synchronization rules, they may # catch it. # # In addition to pushing the error token, we call call # the user defined p_error() function if this is the # first syntax error. This function is only called if # errorcount == 0. if not self.errorcount: self.errorcount = error_count errtoken = lookahead if errtoken.type == '$': errtoken = None # End of file! if self.errorfunc: global errok,token,restart errok = self.errok # Set some special functions available in error recovery token = get_token restart = self.restart tok = self.errorfunc(errtoken) del errok, token, restart # Delete special functions if not self.errorcount: # User must have done some kind of panic # mode recovery on their own. The # returned token is the next lookahead lookahead = tok errtoken = None continue else: if errtoken: if hasattr(errtoken,"lineno"): lineno = lookahead.lineno else: lineno = 0 if lineno: sys.stderr.write("yacc: Syntax error at line %d, token=%s\n" % (lineno, errtoken.type)) else: sys.stderr.write("yacc: Syntax error, token=%s" % errtoken.type) else: sys.stderr.write("yacc: Parse error in input. EOF\n") return else: self.errorcount = error_count # case 1: the statestack only has 1 entry on it. If we're in this state, the # entire parse has been rolled back and we're completely hosed. The token is # discarded and we just keep going. if len(statestack) <= 1 and lookahead.type != '$': lookahead = None errtoken = None # Nuke the pushback stack del lookaheadstack[:] continue # case 2: the statestack has a couple of entries on it, but we're # at the end of the file. nuke the top entry and generate an error token # Start nuking entries on the stack if lookahead.type == '$': # Whoa. We're really hosed here. Bail out return if lookahead.type != 'error': sym = symstack[-1] if sym.type == 'error': # Hmmm. Error is on top of stack, we'll just nuke input # symbol and continue lookahead = None continue t = YaccSymbol() t.type = 'error' if hasattr(lookahead,"lineno"): t.lineno = lookahead.lineno t.value = lookahead lookaheadstack.append(lookahead) lookahead = t else: symstack.pop() statestack.pop() continue # Call an error function here raise RuntimeError, "yacc: internal parser error!!!\n" # ----------------------------------------------------------------------------- # === Parser Construction === # # The following functions and variables are used to implement the yacc() function # itself. This is pretty hairy stuff involving lots of error checking, # construction of LR items, kernels, and so forth. Although a lot of # this work is done using global variables, the resulting Parser object # is completely self contained--meaning that it is safe to repeatedly # call yacc() with different grammars in the same application. # ----------------------------------------------------------------------------- # ----------------------------------------------------------------------------- # validate_file() # # This function checks to see if there are duplicated p_rulename() functions # in the parser module file. Without this function, it is really easy for # users to make mistakes by cutting and pasting code fragments (and it's a real # bugger to try and figure out why the resulting parser doesn't work). Therefore, # we just do a little regular expression pattern matching of def statements # to try and detect duplicates. 
# ----------------------------------------------------------------------------- def validate_file(filename): base,ext = os.path.splitext(filename) if ext != '.py': return 1 # No idea. Assume it's okay. try: f = open(filename) lines = f.readlines() f.close() except IOError: return 1 # Oh well # Match def p_funcname( fre = re.compile(r'\s*def\s+(p_[a-zA-Z_0-9]*)\(') counthash = { } linen = 1 noerror = 1 for l in lines: m = fre.match(l) if m: name = m.group(1) prev = counthash.get(name) if not prev: counthash[name] = linen else: sys.stderr.write("%s:%d: Function %s redefined. Previously defined on line %d\n" % (filename,linen,name,prev)) noerror = 0 linen += 1 return noerror # This function looks for functions that might be grammar rules, but which don't have the proper p_suffix. def validate_dict(d): for n,v in d.items(): if n[0:2] == 'p_' and type(v) in (types.FunctionType, types.MethodType): continue if n[0:2] == 't_': continue if n[0:2] == 'p_': sys.stderr.write("yacc: Warning. '%s' not defined as a function\n" % n) if 1 and isinstance(v,types.FunctionType) and v.func_code.co_argcount == 1: try: doc = v.__doc__.split(" ") if doc[1] == ':': sys.stderr.write("%s:%d: Warning. Possible grammar rule '%s' defined without p_ prefix.\n" % (v.func_code.co_filename, v.func_code.co_firstlineno,n)) except StandardError: pass # ----------------------------------------------------------------------------- # === GRAMMAR FUNCTIONS === # # The following global variables and functions are used to store, manipulate, # and verify the grammar rules specified by the user. # ----------------------------------------------------------------------------- # Initialize all of the global variables used during grammar construction def initialize_vars(): global Productions, Prodnames, Prodmap, Terminals global Nonterminals, First, Follow, Precedence, LRitems global Errorfunc, Signature, Requires # LALR(1) globals global Prodempty, TReductions, NTReductions, GotoSetNum, Canonical Productions = [None] # A list of all of the productions. The first # entry is always reserved for the purpose of # building an augmented grammar Prodnames = { } # A dictionary mapping the names of nonterminals to a list of all # productions of that nonterminal. Prodmap = { } # A dictionary that is only used to detect duplicate # productions. Terminals = { } # A dictionary mapping the names of terminal symbols to a # list of the rules where they are used. Nonterminals = { } # A dictionary mapping names of nonterminals to a list # of rule numbers where they are used. First = { } # A dictionary of precomputed FIRST(x) symbols Follow = { } # A dictionary of precomputed FOLLOW(x) symbols Precedence = { } # Precedence rules for each terminal. Contains tuples of the # form ('right',level) or ('nonassoc', level) or ('left',level) LRitems = [ ] # A list of all LR items for the grammar. These are the # productions with the "dot" like E -> E . PLUS E Errorfunc = None # User defined error handler Signature = md5.new() # Digital signature of the grammar rules, precedence # and other information. Used to determined when a # parsing table needs to be regenerated. 
Requires = { } # Requires list # LALR(1) Initialization Prodempty = { } # A dictionary of all productions that have an empty rule # of the form P : TReductions = { } # A dictionary of precomputer reductions from # nonterminals to terminals NTReductions = { } # A dictionary of precomputed reductions from # nonterminals to nonterminals GotoSetNum = { } # A dictionary that remembers goto sets based on # the state number and symbol Canonical = { } # A list of LR item sets. A LR item set is a list of LR # items that represent the state of the parser # File objects used when creating the parser.out debugging file global _vf, _vfc _vf = cStringIO.StringIO() _vfc = cStringIO.StringIO() # ----------------------------------------------------------------------------- # class Production: # # This class stores the raw information about a single production or grammar rule. # It has a few required attributes: # # name - Name of the production (nonterminal) # prod - A list of symbols making up its production # number - Production number. # # In addition, a few additional attributes are used to help with debugging or # optimization of table generation. # # file - File where production action is defined. # lineno - Line number where action is defined # func - Action function # prec - Precedence level # lr_next - Next LR item. Example, if we are ' E -> E . PLUS E' # then lr_next refers to 'E -> E PLUS . E' # lr_index - LR item index (location of the ".") in the prod list. # lookaheads - LALR lookahead symbols for this item # len - Length of the production (number of symbols on right hand side) # ----------------------------------------------------------------------------- class Production: def __init__(self,**kw): for k,v in kw.items(): setattr(self,k,v) self.lr_index = -1 self.lr0_added = 0 # Flag indicating whether or not added to LR0 closure self.lr1_added = 0 # Flag indicating whether or not added to LR1 self.usyms = [ ] self.lookaheads = { } self.lk_added = 0 self.setnumbers = [ ] def __str__(self): if self.prod: s = "%s -> %s" % (self.name," ".join(self.prod)) else: s = "%s -> " % self.name return s def __repr__(self): return str(self) # Compute lr_items from the production def lr_item(self,n): if n > len(self.prod): return None p = Production() p.name = self.name p.prod = list(self.prod) p.number = self.number p.lr_index = n p.lookaheads = { } p.setnumbers = self.setnumbers p.prod.insert(n,".") p.prod = tuple(p.prod) p.len = len(p.prod) p.usyms = self.usyms # Precompute list of productions immediately following try: p.lrafter = Prodnames[p.prod[n+1]] except (IndexError,KeyError),e: p.lrafter = [] try: p.lrbefore = p.prod[n-1] except IndexError: p.lrbefore = None return p class MiniProduction: pass # Utility function def is_identifier(s): for c in s: if not (c.isalnum() or c == '_'): return 0 return 1 # ----------------------------------------------------------------------------- # add_production() # # Given an action function, this function assembles a production rule. # The production rule is assumed to be found in the function's docstring. # This rule has the general syntax: # # name1 ::= production1 # | production2 # | production3 # ... # | productionn # name2 ::= production1 # | production2 # ... # ----------------------------------------------------------------------------- def add_production(f,file,line,prodname,syms): if Terminals.has_key(prodname): sys.stderr.write("%s:%d: Illegal rule name '%s'. 
Already defined as a token.\n" % (file,line,prodname)) return -1 if prodname == 'error': sys.stderr.write("%s:%d: Illegal rule name '%s'. error is a reserved word.\n" % (file,line,prodname)) return -1 if not is_identifier(prodname): sys.stderr.write("%s:%d: Illegal rule name '%s'\n" % (file,line,prodname)) return -1 for s in syms: if not is_identifier(s) and s != '%prec': sys.stderr.write("%s:%d: Illegal name '%s' in rule '%s'\n" % (file,line,s, prodname)) return -1 # See if the rule is already in the rulemap map = "%s -> %s" % (prodname,syms) if Prodmap.has_key(map): m = Prodmap[map] sys.stderr.write("%s:%d: Duplicate rule %s.\n" % (file,line, m)) sys.stderr.write("%s:%d: Previous definition at %s:%d\n" % (file,line, m.file, m.line)) return -1 p = Production() p.name = prodname p.prod = syms p.file = file p.line = line p.func = f p.number = len(Productions) Productions.append(p) Prodmap[map] = p if not Nonterminals.has_key(prodname): Nonterminals[prodname] = [ ] # Add all terminals to Terminals i = 0 while i < len(p.prod): t = p.prod[i] if t == '%prec': try: precname = p.prod[i+1] except IndexError: sys.stderr.write("%s:%d: Syntax error. Nothing follows %%prec.\n" % (p.file,p.line)) return -1 prec = Precedence.get(precname,None) if not prec: sys.stderr.write("%s:%d: Nothing known about the precedence of '%s'\n" % (p.file,p.line,precname)) return -1 else: p.prec = prec del p.prod[i] del p.prod[i] continue if Terminals.has_key(t): Terminals[t].append(p.number) # Is a terminal. We'll assign a precedence to p based on this if not hasattr(p,"prec"): p.prec = Precedence.get(t,('right',0)) else: if not Nonterminals.has_key(t): Nonterminals[t] = [ ] Nonterminals[t].append(p.number) i += 1 if not hasattr(p,"prec"): p.prec = ('right',0) # Set final length of productions p.len = len(p.prod) p.prod = tuple(p.prod) # Calculate unique syms in the production p.usyms = [ ] for s in p.prod: if s not in p.usyms: p.usyms.append(s) # Add to the global productions list try: Prodnames[p.name].append(p) except KeyError: Prodnames[p.name] = [ p ] return 0 # Given a raw rule function, this function rips out its doc string # and adds rules to the grammar def add_function(f): line = f.func_code.co_firstlineno file = f.func_code.co_filename error = 0 if isinstance(f,types.MethodType): reqdargs = 2 else: reqdargs = 1 if f.func_code.co_argcount > reqdargs: sys.stderr.write("%s:%d: Rule '%s' has too many arguments.\n" % (file,line,f.__name__)) return -1 if f.func_code.co_argcount < reqdargs: sys.stderr.write("%s:%d: Rule '%s' requires an argument.\n" % (file,line,f.__name__)) return -1 if f.__doc__: # Split the doc string into lines pstrings = f.__doc__.splitlines() lastp = None dline = line for ps in pstrings: dline += 1 p = ps.split() if not p: continue try: if p[0] == '|': # This is a continuation of a previous rule if not lastp: sys.stderr.write("%s:%d: Misplaced '|'.\n" % (file,dline)) return -1 prodname = lastp if len(p) > 1: syms = p[1:] else: syms = [ ] else: prodname = p[0] lastp = prodname assign = p[1] if len(p) > 2: syms = p[2:] else: syms = [ ] if assign != ':' and assign != '::=': sys.stderr.write("%s:%d: Syntax error. 
Expected ':'\n" % (file,dline)) return -1 e = add_production(f,file,dline,prodname,syms) error += e except StandardError: sys.stderr.write("%s:%d: Syntax error in rule '%s'\n" % (file,dline,ps)) error -= 1 else: sys.stderr.write("%s:%d: No documentation string specified in function '%s'\n" % (file,line,f.__name__)) return error # Cycle checking code (Michael Dyck) def compute_reachable(): ''' Find each symbol that can be reached from the start symbol. Print a warning for any nonterminals that can't be reached. (Unused terminals have already had their warning.) ''' Reachable = { } for s in Terminals.keys() + Nonterminals.keys(): Reachable[s] = 0 mark_reachable_from( Productions[0].prod[0], Reachable ) for s in Nonterminals.keys(): if not Reachable[s]: sys.stderr.write("yacc: Symbol '%s' is unreachable.\n" % s) def mark_reachable_from(s, Reachable): ''' Mark all symbols that are reachable from symbol s. ''' if Reachable[s]: # We've already reached symbol s. return Reachable[s] = 1 for p in Prodnames.get(s,[]): for r in p.prod: mark_reachable_from(r, Reachable) # ----------------------------------------------------------------------------- # compute_terminates() # # This function looks at the various parsing rules and tries to detect # infinite recursion cycles (grammar rules where there is no possible way # to derive a string of only terminals). # ----------------------------------------------------------------------------- def compute_terminates(): ''' Raise an error for any symbols that don't terminate. ''' Terminates = {} # Terminals: for t in Terminals.keys(): Terminates[t] = 1 Terminates['$'] = 1 # Nonterminals: # Initialize to false: for n in Nonterminals.keys(): Terminates[n] = 0 # Then propagate termination until no change: while 1: some_change = 0 for (n,pl) in Prodnames.items(): # Nonterminal n terminates iff any of its productions terminates. for p in pl: # Production p terminates iff all of its rhs symbols terminate. for s in p.prod: if not Terminates[s]: # The symbol s does not terminate, # so production p does not terminate. p_terminates = 0 break else: # didn't break from the loop, # so every symbol s terminates # so production p terminates. p_terminates = 1 if p_terminates: # symbol n terminates! if not Terminates[n]: Terminates[n] = 1 some_change = 1 # Don't need to consider any more productions for this n. break if not some_change: break some_error = 0 for (s,terminates) in Terminates.items(): if not terminates: if not Prodnames.has_key(s) and not Terminals.has_key(s) and s != 'error': # s is used-but-not-defined, and we've already warned of that, # so it would be overkill to say that it's also non-terminating. pass else: sys.stderr.write("yacc: Infinite recursion detected for symbol '%s'.\n" % s) some_error = 1 return some_error # ----------------------------------------------------------------------------- # verify_productions() # # This function examines all of the supplied rules to see if they seem valid. # ----------------------------------------------------------------------------- def verify_productions(cycle_check=1): error = 0 for p in Productions: if not p: continue for s in p.prod: if not Prodnames.has_key(s) and not Terminals.has_key(s) and s != 'error': sys.stderr.write("%s:%d: Symbol '%s' used, but not defined as a token or a rule.\n" % (p.file,p.line,s)) error = 1 continue unused_tok = 0 # Now verify all of the tokens if yaccdebug: _vf.write("Unused terminals:\n\n") for s,v in Terminals.items(): if s != 'error' and not v: sys.stderr.write("yacc: Warning. 
Token '%s' defined, but not used.\n" % s) if yaccdebug: _vf.write(" %s\n"% s) unused_tok += 1 # Print out all of the productions if yaccdebug: _vf.write("\nGrammar\n\n") for i in range(1,len(Productions)): _vf.write("Rule %-5d %s\n" % (i, Productions[i])) unused_prod = 0 # Verify the use of all productions for s,v in Nonterminals.items(): if not v: p = Prodnames[s][0] sys.stderr.write("%s:%d: Warning. Rule '%s' defined, but not used.\n" % (p.file,p.line, s)) unused_prod += 1 if unused_tok == 1: sys.stderr.write("yacc: Warning. There is 1 unused token.\n") if unused_tok > 1: sys.stderr.write("yacc: Warning. There are %d unused tokens.\n" % unused_tok) if unused_prod == 1: sys.stderr.write("yacc: Warning. There is 1 unused rule.\n") if unused_prod > 1: sys.stderr.write("yacc: Warning. There are %d unused rules.\n" % unused_prod) if yaccdebug: _vf.write("\nTerminals, with rules where they appear\n\n") ks = Terminals.keys() ks.sort() for k in ks: _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Terminals[k]]))) _vf.write("\nNonterminals, with rules where they appear\n\n") ks = Nonterminals.keys() ks.sort() for k in ks: _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Nonterminals[k]]))) if (cycle_check): compute_reachable() error += compute_terminates() # error += check_cycles() return error # ----------------------------------------------------------------------------- # build_lritems() # # This function walks the list of productions and builds a complete set of the # LR items. The LR items are stored in two ways: First, they are uniquely # numbered and placed in the list _lritems. Second, a linked list of LR items # is built for each production. For example: # # E -> E PLUS E # # Creates the list # # [E -> . E PLUS E, E -> E . PLUS E, E -> E PLUS . E, E -> E PLUS E . ] # ----------------------------------------------------------------------------- def build_lritems(): for p in Productions: lastlri = p lri = p.lr_item(0) i = 0 while 1: lri = p.lr_item(i) lastlri.lr_next = lri if not lri: break lri.lr_num = len(LRitems) LRitems.append(lri) lastlri = lri i += 1 # In order for the rest of the parser generator to work, we need to # guarantee that no more lritems are generated. Therefore, we nuke # the p.lr_item method. (Only used in debugging) # Production.lr_item = None # ----------------------------------------------------------------------------- # add_precedence() # # Given a list of precedence rules, add to the precedence table. # ----------------------------------------------------------------------------- def add_precedence(plist): plevel = 0 error = 0 for p in plist: plevel += 1 try: prec = p[0] terms = p[1:] if prec != 'left' and prec != 'right' and prec != 'nonassoc': sys.stderr.write("yacc: Invalid precedence '%s'\n" % prec) return -1 for t in terms: if Precedence.has_key(t): sys.stderr.write("yacc: Precedence already specified for terminal '%s'\n" % t) error += 1 continue Precedence[t] = (prec,plevel) except: sys.stderr.write("yacc: Invalid precedence table.\n") error += 1 return error # ----------------------------------------------------------------------------- # augment_grammar() # # Compute the augmented grammar. This is just a rule S' -> start where start # is the starting symbol. 
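# Illustrative sketch (not part of the original PLY source), referring back to
# add_precedence() above: the precedence table it consumes is the 'precedence'
# tuple from the caller's grammar module.  Token names here are hypothetical:
#
#     precedence = (
#         ('left',  'PLUS',  'MINUS'),
#         ('left',  'TIMES', 'DIVIDE'),
#         ('right', 'UMINUS'),        # last entry gets the highest level
#     )
#
# Each inner tuple raises the precedence level by one, so TIMES/DIVIDE
# out-rank PLUS/MINUS when shift/reduce conflicts are resolved later on.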
# ----------------------------------------------------------------------------- def augment_grammar(start=None): if not start: start = Productions[1].name Productions[0] = Production(name="S'",prod=[start],number=0,len=1,prec=('right',0),func=None) Productions[0].usyms = [ start ] Nonterminals[start].append(0) # ------------------------------------------------------------------------- # first() # # Compute the value of FIRST1(beta) where beta is a tuple of symbols. # # During execution of compute_first1, the result may be incomplete. # Afterward (e.g., when called from compute_follow()), it will be complete. # ------------------------------------------------------------------------- def first(beta): # We are computing First(x1,x2,x3,...,xn) result = [ ] for x in beta: x_produces_empty = 0 # Add all the non- symbols of First[x] to the result. for f in First[x]: if f == '': x_produces_empty = 1 else: if f not in result: result.append(f) if x_produces_empty: # We have to consider the next x in beta, # i.e. stay in the loop. pass else: # We don't have to consider any further symbols in beta. break else: # There was no 'break' from the loop, # so x_produces_empty was true for all x in beta, # so beta produces empty as well. result.append('') return result # FOLLOW(x) # Given a non-terminal. This function computes the set of all symbols # that might follow it. Dragon book, p. 189. def compute_follow(start=None): # Add '$' to the follow list of the start symbol for k in Nonterminals.keys(): Follow[k] = [ ] if not start: start = Productions[1].name Follow[start] = [ '$' ] while 1: didadd = 0 for p in Productions[1:]: # Here is the production set for i in range(len(p.prod)): B = p.prod[i] if Nonterminals.has_key(B): # Okay. We got a non-terminal in a production fst = first(p.prod[i+1:]) hasempty = 0 for f in fst: if f != '' and f not in Follow[B]: Follow[B].append(f) didadd = 1 if f == '': hasempty = 1 if hasempty or i == (len(p.prod)-1): # Add elements of follow(a) to follow(b) for f in Follow[p.name]: if f not in Follow[B]: Follow[B].append(f) didadd = 1 if not didadd: break if 0 and yaccdebug: _vf.write('\nFollow:\n') for k in Nonterminals.keys(): _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Follow[k]]))) # ------------------------------------------------------------------------- # compute_first1() # # Compute the value of FIRST1(X) for all symbols # ------------------------------------------------------------------------- def compute_first1(): # Terminals: for t in Terminals.keys(): First[t] = [t] First['$'] = ['$'] First['#'] = ['#'] # what's this for? # Nonterminals: # Initialize to the empty set: for n in Nonterminals.keys(): First[n] = [] # Then propagate symbols until no change: while 1: some_change = 0 for n in Nonterminals.keys(): for p in Prodnames[n]: for f in first(p.prod): if f not in First[n]: First[n].append( f ) some_change = 1 if not some_change: break if 0 and yaccdebug: _vf.write('\nFirst:\n') for k in Nonterminals.keys(): _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in First[k]]))) # ----------------------------------------------------------------------------- # === SLR Generation === # # The following functions are used to construct SLR (Simple LR) parsing tables # as described on p.221-229 of the dragon book. 
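# Illustrative worked example (not from the original source) of what
# compute_first1() and compute_follow() above produce.  For the toy grammar
#
#     expr : expr PLUS term
#          | term
#     term : NUMBER
#
# the fixed points reached are, roughly:
#
#     First  = {'expr': ['NUMBER'], 'term': ['NUMBER'],
#               'PLUS': ['PLUS'],   'NUMBER': ['NUMBER']}
#     Follow = {'expr': ['$', 'PLUS'], 'term': ['$', 'PLUS']}
#
# i.e. every expr/term derivation starts with a NUMBER token, and either
# nonterminal may be followed by end-of-input or by a PLUS.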
# ----------------------------------------------------------------------------- # Global variables for the LR parsing engine def lr_init_vars(): global _lr_action, _lr_goto, _lr_method global _lr_goto_cache _lr_action = { } # Action table _lr_goto = { } # Goto table _lr_method = "Unknown" # LR method used _lr_goto_cache = { } # Compute the LR(0) closure operation on I, where I is a set of LR(0) items. # prodlist is a list of productions. _add_count = 0 # Counter used to detect cycles def lr0_closure(I): global _add_count _add_count += 1 prodlist = Productions # Add everything in I to J J = I[:] didadd = 1 while didadd: didadd = 0 for j in J: for x in j.lrafter: if x.lr0_added == _add_count: continue # Add B --> .G to J J.append(x.lr_next) x.lr0_added = _add_count didadd = 1 return J # Compute the LR(0) goto function goto(I,X) where I is a set # of LR(0) items and X is a grammar symbol. This function is written # in a way that guarantees uniqueness of the generated goto sets # (i.e. the same goto set will never be returned as two different Python # objects). With uniqueness, we can later do fast set comparisons using # id(obj) instead of element-wise comparison. def lr0_goto(I,x): # First we look for a previously cached entry g = _lr_goto_cache.get((id(I),x),None) if g: return g # Now we generate the goto set in a way that guarantees uniqueness # of the result s = _lr_goto_cache.get(x,None) if not s: s = { } _lr_goto_cache[x] = s gs = [ ] for p in I: n = p.lr_next if n and n.lrbefore == x: s1 = s.get(id(n),None) if not s1: s1 = { } s[id(n)] = s1 gs.append(n) s = s1 g = s.get('$',None) if not g: if gs: g = lr0_closure(gs) s['$'] = g else: s['$'] = gs _lr_goto_cache[(id(I),x)] = g return g # Added for LALR(1) # Given a setnumber of an lr0 state and a symbol return the setnumber of the goto state def lr0_goto_setnumber(I_setnumber, x): global Canonical global GotoSetNum if GotoSetNum.has_key((I_setnumber, x)): setnumber = GotoSetNum[(I_setnumber, x)] else: gset = lr0_goto(Canonical[I_setnumber], x) if not gset: return -1 else: gsetlen = len(gset) for i in xrange(len(gset[0].setnumbers)): inall = 1 for item in gset: if not item.setnumbers[i]: inall = 0 break if inall and len(Canonical[i]) == gsetlen: setnumber = i break # Note: DB. I added this to improve performance. # Not sure if this breaks the algorithm (it doesn't appear to). GotoSetNum[(I_setnumber, x)] = setnumber return setnumber # Compute the kernel of a set of LR(0) items def lr0_kernel(I): KI = [ ] for p in I: if p.name == "S'" or p.lr_index > 0 or p.len == 0: KI.append(p) return KI _lr0_cidhash = { } # Compute the LR(0) sets of item function def lr0_items(): C = [ lr0_closure([Productions[0].lr_next]) ] i = 0 for I in C: _lr0_cidhash[id(I)] = i i += 1 # Loop over the items in C and each grammar symbols i = 0 while i < len(C): I = C[i] i += 1 # Collect all of the symbols that could possibly be in the goto(I,X) sets asyms = { } for ii in I: for s in ii.usyms: asyms[s] = None for x in asyms.keys(): g = lr0_goto(I,x) if not g: continue if _lr0_cidhash.has_key(id(g)): continue _lr0_cidhash[id(g)] = len(C) C.append(g) return C # ----------------------------------------------------------------------------- # slr_parse_table() # # This function constructs an SLR table. 
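# Illustrative worked example (not from the original source): for the same toy
# grammar (expr : expr PLUS term | term ; term : NUMBER), the first state I0
# built by lr0_items() above is the closure of the augmented start item:
#
#     S'   -> . expr
#     expr -> . expr PLUS term
#     expr -> . term
#     term -> . NUMBER
#
# and lr0_goto(I0, 'NUMBER') yields the one-item state [term -> NUMBER .],
# which slr_parse_table() below turns into a reduce action on every symbol
# in Follow['term'].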
# ----------------------------------------------------------------------------- def slr_parse_table(): global _lr_method goto = _lr_goto # Goto array action = _lr_action # Action array actionp = { } # Action production array (temporary) _lr_method = "SLR" n_srconflict = 0 n_rrconflict = 0 if yaccdebug: sys.stderr.write("yacc: Generating SLR parsing table...\n") _vf.write("\n\nParsing method: SLR\n\n") # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items # This determines the number of states C = lr0_items() # Build the parser table, state by state st = 0 for I in C: # Loop over each production in I actlist = [ ] # List of actions if yaccdebug: _vf.write("\nstate %d\n\n" % st) for p in I: _vf.write(" (%d) %s\n" % (p.number, str(p))) _vf.write("\n") for p in I: try: if p.prod[-1] == ".": if p.name == "S'": # Start symbol. Accept! action[st,"$"] = 0 actionp[st,"$"] = p else: # We are at the end of a production. Reduce! for a in Follow[p.name]: actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p))) r = action.get((st,a),None) if r is not None: # Whoa. Have a shift/reduce or reduce/reduce conflict if r > 0: # Need to decide on shift or reduce here # By default we favor shifting. Need to add # some precedence rules here. sprec,slevel = Productions[actionp[st,a].number].prec rprec,rlevel = Precedence.get(a,('right',0)) if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): # We really need to reduce here. action[st,a] = -p.number actionp[st,a] = p if not slevel and not rlevel: _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) n_srconflict += 1 elif (slevel == rlevel) and (rprec == 'nonassoc'): action[st,a] = None else: # Hmmm. Guess we'll keep the shift if not slevel and not rlevel: _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) n_srconflict +=1 elif r < 0: # Reduce/reduce conflict. In this case, we favor the rule # that was defined first in the grammar file oldp = Productions[-r] pp = Productions[p.number] if oldp.line > pp.line: action[st,a] = -p.number actionp[st,a] = p # sys.stderr.write("Reduce/reduce conflict in state %d\n" % st) n_rrconflict += 1 _vfc.write("reduce/reduce conflict in state %d resolved using rule %d (%s).\n" % (st, actionp[st,a].number, actionp[st,a])) _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d (%s).\n" % (a,actionp[st,a].number, actionp[st,a])) else: sys.stderr.write("Unknown conflict in state %d\n" % st) else: action[st,a] = -p.number actionp[st,a] = p else: i = p.lr_index a = p.prod[i+1] # Get symbol right after the "." if Terminals.has_key(a): g = lr0_goto(I,a) j = _lr0_cidhash.get(id(g),-1) if j >= 0: # We are in a shift state actlist.append((a,p,"shift and go to state %d" % j)) r = action.get((st,a),None) if r is not None: # Whoa have a shift/reduce or shift/shift conflict if r > 0: if r != j: sys.stderr.write("Shift/shift conflict in state %d\n" % st) elif r < 0: # Do a precedence check. # - if precedence of reduce rule is higher, we reduce. # - if precedence of reduce is same and left assoc, we reduce. # - otherwise we shift rprec,rlevel = Productions[actionp[st,a].number].prec sprec,slevel = Precedence.get(a,('right',0)) if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')): # We decide to shift here... 
highest precedence to shift action[st,a] = j actionp[st,a] = p if not slevel and not rlevel: n_srconflict += 1 _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) elif (slevel == rlevel) and (rprec == 'nonassoc'): action[st,a] = None else: # Hmmm. Guess we'll keep the reduce if not slevel and not rlevel: n_srconflict +=1 _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) else: sys.stderr.write("Unknown conflict in state %d\n" % st) else: action[st,a] = j actionp[st,a] = p except StandardError,e: raise YaccError, "Hosed in slr_parse_table", e # Print the actions associated with each terminal if yaccdebug: _actprint = { } for a,p,m in actlist: if action.has_key((st,a)): if p is actionp[st,a]: _vf.write(" %-15s %s\n" % (a,m)) _actprint[(a,m)] = 1 _vf.write("\n") for a,p,m in actlist: if action.has_key((st,a)): if p is not actionp[st,a]: if not _actprint.has_key((a,m)): _vf.write(" ! %-15s [ %s ]\n" % (a,m)) _actprint[(a,m)] = 1 # Construct the goto table for this state if yaccdebug: _vf.write("\n") nkeys = { } for ii in I: for s in ii.usyms: if Nonterminals.has_key(s): nkeys[s] = None for n in nkeys.keys(): g = lr0_goto(I,n) j = _lr0_cidhash.get(id(g),-1) if j >= 0: goto[st,n] = j if yaccdebug: _vf.write(" %-30s shift and go to state %d\n" % (n,j)) st += 1 if yaccdebug: if n_srconflict == 1: sys.stderr.write("yacc: %d shift/reduce conflict\n" % n_srconflict) if n_srconflict > 1: sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict) if n_rrconflict == 1: sys.stderr.write("yacc: %d reduce/reduce conflict\n" % n_rrconflict) if n_rrconflict > 1: sys.stderr.write("yacc: %d reduce/reduce conflicts\n" % n_rrconflict) # ----------------------------------------------------------------------------- # ==== LALR(1) Parsing ==== # FINISHED! 5/20/2003 by Elias Ioup # ----------------------------------------------------------------------------- # Compute the lr1_closure of a set I. I is a list of productions and setnumber # is the state that you want the lr items that are made from the to come from. 
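# Note (illustrative, not part of the original source): the LALR(1) machinery
# in this section is only used when the caller asks for it; yacc() further
# below dispatches on its 'method' argument, so a grammar module would request
# LALR(1) tables with something like
#
#     import yacc
#     parser = yacc.yacc(method='LALR')   # the default is method='SLR'
#
# (sketch only -- the usual tokens/p_* definitions are assumed to exist).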
_lr1_add_count = 0 def lr1_closure(I, setnumber = 0): global _add_count global Nonterminals _add_count += 1 prodlist = Productions # Add everything in I to J J = I[:] Jhash = { } for j in J: Jhash[id(j)] = 1 didadd = 1 while didadd: didadd = 0 for j in J: jprod = j.prod jlr_index = j.lr_index jprodslice = jprod[jlr_index+2:] if jlr_index < len(jprod) - 1 and Nonterminals.has_key(jprod[jlr_index+1]): first_syms = [] if j.lk_added < len(j.lookaheads[setnumber]): for a in j.lookaheads[setnumber][j.lk_added:]: # find b in FIRST(Xa) if j = [A->a.BX,a] temp_first_syms = first(jprodslice + (a,)) for x in temp_first_syms: if x not in first_syms: first_syms.append(x) j.lk_added = len(j.lookaheads[setnumber]) for x in j.lrafter: # Add B --> .G to J if x.lr_next.lookaheads.has_key(setnumber): _xlook = x.lr_next.lookaheads[setnumber] for s in first_syms: if s not in _xlook: _xlook.append(s) didadd = 1 else: didadd = 0 else: x.lr_next.lookaheads[setnumber] = first_syms didadd = 1 nid = id(x.lr_next) if not Jhash.has_key(nid): J.append(x.lr_next) Jhash[nid] = 1 return J def add_lookaheads(K): spontaneous = [] propogate = [] for setnumber in range(len(K)): for kitem in K[setnumber]: kitem.lookaheads[setnumber] = ['#'] J = lr1_closure([kitem], setnumber) # find the lookaheads that are spontaneously created from closures # and the propogations of lookaheads between lr items for item in J: if item.lr_index < len(item.prod)-1: for lookahead in item.lookaheads[setnumber]: if lookahead != '#': goto_setnumber = lr0_goto_setnumber(setnumber, item.prod[item.lr_index+1]) next = None if item.lr_next in K[goto_setnumber]: next = item.lr_next if next: spontaneous.append((next, (lookahead, goto_setnumber))) else: goto_setnumber = lr0_goto_setnumber(setnumber, item.prod[item.lr_index+1]) next = None if goto_setnumber > -1: if item.lr_next in K[goto_setnumber]: next = item.lr_next if next: propogate.append(((kitem, setnumber), (next, goto_setnumber))) for x in K[setnumber]: x.lookaheads[setnumber] = [] for x in spontaneous: if x[1][0] not in x[0].lookaheads[x[1][1]]: x[0].lookaheads[x[1][1]].append(x[1][0]) K[0][0].lookaheads[0] = ['$'] pitems = {} for x in propogate: if pitems.has_key(x[0]): pitems[x[0]].append(x[1]) else: pitems[x[0]] = [] pitems[x[0]].append(x[1]) # propogate the lookaheads that were spontaneously generated # based on the propogations produced above stop = 0 while not stop: stop = 1 kindex = 0 for set in K: for item in set: pkey = (item, kindex) if pitems.has_key(pkey): for propogation in pitems[pkey]: gitem = propogation[0] gsetnumber = propogation[1] glookaheads = gitem.lookaheads[gsetnumber] for lookahead in item.lookaheads[kindex]: if lookahead not in glookaheads: glookaheads.append(lookahead) stop = 0 kindex += 1 def ReduceNonterminals(): global Nonterminals global TReductions global NTReductions for nt in Nonterminals.keys(): TReductions[nt] = [] NTReductions[nt] = [] for nt in Nonterminals.keys(): terms = ReduceToTerminals(nt) TReductions[nt].extend(terms) if not NTReductions.has_key(nt): ReduceToNonterminals(nt) def ReduceToTerminals(nt): global Prodnames global Terminals reducedterminals = [] for p in Prodnames[nt]: if len(p.prod) > 0: if Terminals.has_key(p.prod[0]): if p.prod[0] not in reducedterminals: reducedterminals.append(p.prod[0]) else: if p.prod[0] != nt: terms = ReduceToTerminals(p.prod[0]) for t in terms: if t not in reducedterminals: reducedterminals.append(t) return reducedterminals def ReduceToNonterminals(nt): global Prodnames global Nonterminals global NTReductions 
reducednonterminals = [] for p in Prodnames[nt]: if len(p.prod) > 0: if Nonterminals.has_key(p.prod[0]): if p.prod[0] not in reducednonterminals: reducednonterminals.append(p.prod[0]) if p.prod[0] != nt: if not NTReductions.has_key(p.prod[0]): ReduceToNonterminals(p.prod[0]) nterms = NTReductions[p.prod[0]] for nt in nterms: if nt not in reducednonterminals: reducednonterminals.append(nt) NTReductions[nt] = reducednonterminals # ----------------------------------------------------------------------------- # lalr_parse_table() # # This function constructs an LALR table. # ----------------------------------------------------------------------------- def lalr_parse_table(): global _lr_method goto = _lr_goto # Goto array action = _lr_action # Action array actionp = { } # Action production array (temporary) goto_cache = _lr_goto_cache cid_hash = _lr0_cidhash _lr_method = "LALR" n_srconflict = 0 n_rrconflict = 0 if yaccdebug: sys.stderr.write("yacc: Generating LALR(1) parsing table...\n") _vf.write("\n\nParsing method: LALR(1)\n\n") # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items # This determines the number of states C = lr0_items() global Canonical Canonical = C ### # Create the kernel states. ### K = [] setC = [0]*len(C) for x in C: K.append(lr0_kernel(x)) for y in x: y.setnumbers = setC[:] _cindex = 0 for x in C: for y in x: y.lookaheads[_cindex] = [] y.setnumbers[_cindex] = 1 _cindex = _cindex + 1 ### # Add lookaheads to the lr items ### add_lookaheads(K) ### # Do the reductions for parsing first and keep them in globals ### ReduceNonterminals() global TReductions global NTReductions global Prodempty EmptyAncestors = {} for y in Prodempty.keys(): EmptyAncestors[y] = [] for x in NTReductions.items(): for y in x[1]: if Prodempty.has_key(y): EmptyAncestors[y].append(x[0]) # Build the parser table, state by state st = 0 for I in C: # Loop over each production in I actlist = [ ] # List of actions acthash = { } idI = id(I) if yaccdebug: _vf.write("\nstate %d\n\n" % st) for p in I: _vf.write(" (%d) %s\n" % (p.number, str(p))) _vf.write("\n") global First for p in I: try: if p.prod[-1] == ".": if p.name == "S'": # Start symbol. Accept! action[st,"$"] = 0 actionp[st,"$"] = p elif len(p.prod) == 0: ancestors = EmptyAncestors[p.name] for i in ancestors: for s in K: if i in s: input_list = [] plist = Productions[i.name] for x in plist: if len(x.prod) > 0 and x.prod[0] == p.name: n = p.prod[1:] d = x.prod[lr_index+2:] for l in x.lookaheads.items(): flist = First[tuple(n+d+[l])] for f in flist: if f not in input_list and f in p.lookaheads[st]: input_list.append(f) # We are at the end of a production. Reduce! #print "input_list: %s" % input_list #print "Follow[p.name]: %s" % Follow[p.name] for a in input_list: actlist.append((a,p,"reduce using rule %d (%s) " % (p.number,p))) r = action.get((st,a),None) if r is not None: # Whoa. Have a shift/reduce or reduce/reduce conflict if r > 0: # Need to decide on shift or reduce here # By default we favor shifting. Need to add # some precedence rules here. sprec,slevel = Productions[actionp[st,a].number].prec rprec,rlevel = Precedence.get(a,('right',0)) if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): # We really need to reduce here. action[st,a] = -p.number actionp[st,a] = p if not slevel and not rlevel: _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) _vf.write(" ! 
shift/reduce conflict for %s resolved as reduce.\n" % a) n_srconflict += 1 elif (slevel == rlevel) and (rprec == 'nonassoc'): action[st,a] = None else: # Hmmm. Guess we'll keep the shift if not slevel and not rlevel: _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) n_srconflict +=1 elif r < 0: # Reduce/reduce conflict. In this case, we favor the rule # that was defined first in the grammar file oldp = Productions[-r] pp = Productions[p.number] if oldp.line > pp.line: action[st,a] = -p.number actionp[st,a] = p # print "Reduce/reduce conflict in state %d" % st n_rrconflict += 1 _vfc.write("reduce/reduce conflict in state %d resolved using rule %d.\n" % (st, actionp[st,a].number)) _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d.\n" % (a,actionp[st,a].number)) else: sys.stderr.write("Unknown conflict in state %d\n" % st) else: action[st,a] = -p.number actionp[st,a] = p break # break out of the for s in K loop because we only want to make # sure that a production is in the Kernel else: # We are at the end of a production. Reduce! for a in p.lookaheads[st]: actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p))) r = action.get((st,a),None) if r is not None: # Whoa. Have a shift/reduce or reduce/reduce conflict if r > 0: # Need to decide on shift or reduce here # By default we favor shifting. Need to add # some precedence rules here. sprec,slevel = Productions[actionp[st,a].number].prec rprec,rlevel = Precedence.get(a,('right',0)) if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): # We really need to reduce here. action[st,a] = -p.number actionp[st,a] = p if not slevel and not rlevel: _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) n_srconflict += 1 elif (slevel == rlevel) and (rprec == 'nonassoc'): action[st,a] = None else: # Hmmm. Guess we'll keep the shift if not slevel and not rlevel: _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) n_srconflict +=1 elif r < 0: # Reduce/reduce conflict. In this case, we favor the rule # that was defined first in the grammar file oldp = Productions[-r] pp = Productions[p.number] if oldp.line > pp.line: action[st,a] = -p.number actionp[st,a] = p # print "Reduce/reduce conflict in state %d" % st n_rrconflict += 1 _vfc.write("reduce/reduce conflict in state %d resolved using rule %d.\n" % (st, actionp[st,a].number)) _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d.\n" % (a,actionp[st,a].number)) else: print "Unknown conflict in state %d" % st else: action[st,a] = -p.number actionp[st,a] = p else: i = p.lr_index a = p.prod[i+1] # Get symbol right after the "." if Terminals.has_key(a): g = goto_cache[(idI,a)] j = cid_hash.get(id(g),-1) if j >= 0: # We are in a shift state _k = (a,j) if not acthash.has_key(_k): actlist.append((a,p,"shift and go to state %d" % j)) acthash[_k] = 1 r = action.get((st,a),None) if r is not None: # Whoa have a shift/reduce or shift/shift conflict if r > 0: if r != j: sys.stderr.write("Shift/shift conflict in state %d\n" % st) elif r < 0: # Do a precedence check. # - if precedence of reduce rule is higher, we reduce. # - if precedence of reduce is same and left assoc, we reduce. 
# - otherwise we shift rprec,rlevel = Productions[actionp[st,a].number].prec sprec,slevel = Precedence.get(a,('right',0)) if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')): # We decide to shift here... highest precedence to shift action[st,a] = j actionp[st,a] = p if not slevel and not rlevel: n_srconflict += 1 _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) elif (slevel == rlevel) and (rprec == 'nonassoc'): action[st,a] = None else: # Hmmm. Guess we'll keep the reduce if not slevel and not rlevel: n_srconflict +=1 _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) else: sys.stderr.write("Unknown conflict in state %d\n" % st) else: action[st,a] = j actionp[st,a] = p else: nonterminal = a term_list = TReductions[nonterminal] # DB: This loop gets executed a lot. Try to optimize for a in term_list: g = goto_cache[(idI,a)] j = cid_hash[id(g)] if j >= 0: # We are in a shift state # Don't put repeated shift actions on action list (performance hack) _k = (a,j) if not acthash.has_key(_k): actlist.append((a,p,"shift and go to state "+str(j))) acthash[_k] = 1 r = action.get((st,a),None) if r is not None: # Whoa have a shift/reduce or shift/shift conflict if r > 0: if r != j: sys.stderr.write("Shift/shift conflict in state %d\n" % st) continue elif r < 0: # Do a precedence check. # - if precedence of reduce rule is higher, we reduce. # - if precedence of reduce is same and left assoc, we reduce. # - otherwise we shift rprec,rlevel = Productions[actionp[st,a].number].prec sprec,slevel = Precedence.get(a,('right',0)) if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')): # We decide to shift here... highest precedence to shift action[st,a] = j actionp[st,a] = p if not slevel and not rlevel: n_srconflict += 1 _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) elif (slevel == rlevel) and (rprec == 'nonassoc'): action[st,a] = None else: # Hmmm. Guess we'll keep the reduce if not slevel and not rlevel: n_srconflict +=1 _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) else: sys.stderr.write("Unknown conflict in state %d\n" % st) else: action[st,a] = j actionp[st,a] = p except StandardError,e: raise YaccError, "Hosed in lalr_parse_table", e # Print the actions associated with each terminal if yaccdebug: for a,p,m in actlist: if action.has_key((st,a)): if p is actionp[st,a]: _vf.write(" %-15s %s\n" % (a,m)) _vf.write("\n") for a,p,m in actlist: if action.has_key((st,a)): if p is not actionp[st,a]: _vf.write(" ! 
%-15s [ %s ]\n" % (a,m)) # Construct the goto table for this state nkeys = { } for ii in I: for s in ii.usyms: if Nonterminals.has_key(s): nkeys[s] = None # Construct the goto table for this state for n in nkeys.keys(): g = lr0_goto(I,n) j = cid_hash.get(id(g),-1) if j >= 0: goto[st,n] = j if yaccdebug: _vf.write(" %-30s shift and go to state %d\n" % (n,j)) st += 1 if yaccdebug: if n_srconflict == 1: sys.stderr.write("yacc: %d shift/reduce conflict\n" % n_srconflict) if n_srconflict > 1: sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict) if n_rrconflict == 1: sys.stderr.write("yacc: %d reduce/reduce conflict\n" % n_rrconflict) if n_rrconflict > 1: sys.stderr.write("yacc: %d reduce/reduce conflicts\n" % n_rrconflict) # ----------------------------------------------------------------------------- # ==== LR Utility functions ==== # ----------------------------------------------------------------------------- # ----------------------------------------------------------------------------- # _lr_write_tables() # # This function writes the LR parsing tables to a file # ----------------------------------------------------------------------------- def lr_write_tables(modulename=tab_module): filename = modulename + ".py" try: f = open(filename,"w") f.write(""" # %s # This file is automatically generated. Do not edit. _lr_method = %s _lr_signature = %s """ % (filename, repr(_lr_method), repr(Signature.digest()))) # Change smaller to 0 to go back to original tables smaller = 1 # Factor out names to try and make smaller if smaller: items = { } for k,v in _lr_action.items(): i = items.get(k[1]) if not i: i = ([],[]) items[k[1]] = i i[0].append(k[0]) i[1].append(v) f.write("\n_lr_action_items = {") for k,v in items.items(): f.write("%r:([" % k) for i in v[0]: f.write("%r," % i) f.write("],[") for i in v[1]: f.write("%r," % i) f.write("]),") f.write("}\n") f.write(""" _lr_action = { } for _k, _v in _lr_action_items.items(): for _x,_y in zip(_v[0],_v[1]): _lr_action[(_x,_k)] = _y del _lr_action_items """) else: f.write("\n_lr_action = { "); for k,v in _lr_action.items(): f.write("(%r,%r):%r," % (k[0],k[1],v)) f.write("}\n"); if smaller: # Factor out names to try and make smaller items = { } for k,v in _lr_goto.items(): i = items.get(k[1]) if not i: i = ([],[]) items[k[1]] = i i[0].append(k[0]) i[1].append(v) f.write("\n_lr_goto_items = {") for k,v in items.items(): f.write("%r:([" % k) for i in v[0]: f.write("%r," % i) f.write("],[") for i in v[1]: f.write("%r," % i) f.write("]),") f.write("}\n") f.write(""" _lr_goto = { } for _k, _v in _lr_goto_items.items(): for _x,_y in zip(_v[0],_v[1]): _lr_goto[(_x,_k)] = _y del _lr_goto_items """) else: f.write("\n_lr_goto = { "); for k,v in _lr_goto.items(): f.write("(%r,%r):%r," % (k[0],k[1],v)) f.write("}\n"); # Write production table f.write("_lr_productions = [\n") for p in Productions: if p: if (p.func): f.write(" (%r,%d,%r,%r,%d),\n" % (p.name, p.len, p.func.__name__,p.file,p.line)) else: f.write(" (%r,%d,None,None,None),\n" % (p.name, p.len)) else: f.write(" None,\n") f.write("]\n") f.close() except IOError,e: print "Unable to create '%s'" % filename print e return def lr_read_tables(module=tab_module,optimize=0): global _lr_action, _lr_goto, _lr_productions, _lr_method try: exec "import %s as parsetab" % module if (optimize) or (Signature.digest() == parsetab._lr_signature): _lr_action = parsetab._lr_action _lr_goto = parsetab._lr_goto _lr_productions = parsetab._lr_productions _lr_method = parsetab._lr_method return 1 else: return 0 except 
(ImportError,AttributeError): return 0 # ----------------------------------------------------------------------------- # yacc(module) # # Build the parser module # ----------------------------------------------------------------------------- def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, start=None, check_recursion=1, optimize=0,write_tables=1,debugfile=debug_file): global yaccdebug yaccdebug = debug initialize_vars() files = { } error = 0 # Add starting symbol to signature if start: Signature.update(start) # Add parsing method to signature Signature.update(method) # If a "module" parameter was supplied, extract its dictionary. # Note: a module may in fact be an instance as well. if module: # User supplied a module object. if isinstance(module, types.ModuleType): ldict = module.__dict__ elif isinstance(module, types.InstanceType): _items = [(k,getattr(module,k)) for k in dir(module)] ldict = { } for i in _items: ldict[i[0]] = i[1] else: raise ValueError,"Expected a module" else: # No module given. We might be able to get information from the caller. # Throw an exception and unwind the traceback to get the globals try: raise RuntimeError except RuntimeError: e,b,t = sys.exc_info() f = t.tb_frame f = f.f_back # Walk out to our calling function ldict = f.f_globals # Grab its globals dictionary # If running in optimized mode. We're going to if (optimize and lr_read_tables(tabmodule,1)): # Read parse table del Productions[:] for p in _lr_productions: if not p: Productions.append(None) else: m = MiniProduction() m.name = p[0] m.len = p[1] m.file = p[3] m.line = p[4] if p[2]: m.func = ldict[p[2]] Productions.append(m) else: # Get the tokens map if (module and isinstance(module,types.InstanceType)): tokens = getattr(module,"tokens",None) else: tokens = ldict.get("tokens",None) if not tokens: raise YaccError,"module does not define a list 'tokens'" if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)): raise YaccError,"tokens must be a list or tuple." # Check to see if a requires dictionary is defined. requires = ldict.get("require",None) if requires: if not (isinstance(requires,types.DictType)): raise YaccError,"require must be a dictionary." for r,v in requires.items(): try: if not (isinstance(v,types.ListType)): raise TypeError v1 = [x.split(".") for x in v] Requires[r] = v1 except StandardError: print "Invalid specification for rule '%s' in require. Expected a list of strings" % r # Build the dictionary of terminals. We a record a 0 in the # dictionary to track whether or not a terminal is actually # used in the grammar if 'error' in tokens: print "yacc: Illegal token 'error'. Is a reserved word." raise YaccError,"Illegal token name" for n in tokens: if Terminals.has_key(n): print "yacc: Warning. Token '%s' multiply defined." % n Terminals[n] = [ ] Terminals['error'] = [ ] # Get the precedence map (if any) prec = ldict.get("precedence",None) if prec: if not (isinstance(prec,types.ListType) or isinstance(prec,types.TupleType)): raise YaccError,"precedence must be a list or tuple." add_precedence(prec) Signature.update(repr(prec)) for n in tokens: if not Precedence.has_key(n): Precedence[n] = ('right',0) # Default, right associative, 0 precedence # Look for error handler ef = ldict.get('p_error',None) if ef: if isinstance(ef,types.FunctionType): ismethod = 0 elif isinstance(ef, types.MethodType): ismethod = 1 else: raise YaccError,"'p_error' defined, but is not a function or method." 
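# Illustrative sketch (not part of the original source) of the kind of caller
# module that this introspection in yacc() is validating: a 'tokens' list,
# t_*/p_* rules with the grammar in their docstrings, and a one-argument
# p_error() handler.  All names below (NUMBER, PLUS, p_expr_*, ...) are
# hypothetical examples, not CDSware code:
#
#     import lex
#     import yacc
#
#     tokens = ('NUMBER', 'PLUS')
#
#     t_PLUS   = r'\+'
#     t_ignore = ' \t'
#
#     def t_NUMBER(t):
#         r'\d+'
#         t.value = int(t.value)
#         return t
#
#     def t_error(t):
#         pass   # a real lexer would report and skip the offending character
#
#     def p_expr_plus(p):
#         'expr : expr PLUS term'
#         p[0] = p[1] + p[3]
#
#     def p_expr_term(p):
#         'expr : term'
#         p[0] = p[1]
#
#     def p_term_number(p):
#         'term : NUMBER'
#         p[0] = p[1]
#
#     def p_error(p):
#         print "syntax error near", p
#
#     lex.lex()
#     parser = yacc.yacc()              # builds the tables from the rules above
#     print parser.parse("1 + 2 + 3")   # -> 6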
eline = ef.func_code.co_firstlineno efile = ef.func_code.co_filename files[efile] = None if (ef.func_code.co_argcount != 1+ismethod): raise YaccError,"%s:%d: p_error() requires 1 argument." % (efile,eline) global Errorfunc Errorfunc = ef else: print "yacc: Warning. no p_error() function is defined." # Get the list of built-in functions with p_ prefix symbols = [ldict[f] for f in ldict.keys() if (type(ldict[f]) in (types.FunctionType, types.MethodType) and ldict[f].__name__[:2] == 'p_' and ldict[f].__name__ != 'p_error')] # Check for non-empty symbols if len(symbols) == 0: raise YaccError,"no rules of the form p_rulename are defined." # Sort the symbols by line number symbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno)) # Add all of the symbols to the grammar for f in symbols: if (add_function(f)) < 0: error += 1 else: files[f.func_code.co_filename] = None # Make a signature of the docstrings for f in symbols: if f.__doc__: Signature.update(f.__doc__) lr_init_vars() if error: raise YaccError,"Unable to construct parser." if not lr_read_tables(tabmodule): # Validate files for filename in files.keys(): if not validate_file(filename): error = 1 # Validate dictionary validate_dict(ldict) if start and not Prodnames.has_key(start): raise YaccError,"Bad starting symbol '%s'" % start augment_grammar(start) error = verify_productions(cycle_check=check_recursion) otherfunc = [ldict[f] for f in ldict.keys() if (type(f) in (types.FunctionType,types.MethodType) and ldict[f].__name__[:2] != 'p_')] if error: raise YaccError,"Unable to construct parser." build_lritems() compute_first1() compute_follow(start) if method == 'SLR': slr_parse_table() elif method == 'LALR': lalr_parse_table() else: raise YaccError, "Unknown parsing method '%s'" % method if write_tables: lr_write_tables(tabmodule) if yaccdebug: try: f = open(debugfile,"w") f.write(_vfc.getvalue()) f.write("\n\n") f.write(_vf.getvalue()) f.close() except IOError,e: print "yacc: can't create '%s'" % debugfile,e # Made it here. Create a parser object and set up its internal state. # Set global parse() method to bound method of parser object. p = Parser("xyzzy") p.productions = Productions p.errorfunc = Errorfunc p.action = _lr_action p.goto = _lr_goto p.method = _lr_method p.require = Requires global parse parse = p.parse # Clean up all of the globals we created if (not optimize): yacc_cleanup() return p # yacc_cleanup function. 
Delete all of the global variables # used during table construction def yacc_cleanup(): global _lr_action, _lr_goto, _lr_method, _lr_goto_cache del _lr_action, _lr_goto, _lr_method, _lr_goto_cache global Productions, Prodnames, Prodmap, Terminals global Nonterminals, First, Follow, Precedence, LRitems global Errorfunc, Signature, Requires global Prodempty, TReductions, NTReductions, GotoSetNum, Canonical del Productions, Prodnames, Prodmap, Terminals del Nonterminals, First, Follow, Precedence, LRitems del Errorfunc, Signature, Requires del Prodempty, TReductions, NTReductions, GotoSetNum, Canonical global _vf, _vfc del _vf, _vfc # Stub that raises an error if parsing is attempted without first calling yacc() def parse(*args,**kwargs): raise YaccError, "yacc: No parser built with yacc()" - + diff --git a/modules/elmsubmit/lib/yacc.py.wml b/modules/elmsubmit/lib/yacc.py.wml deleted file mode 100644 index 9b9ffc79e..000000000 --- a/modules/elmsubmit/lib/yacc.py.wml +++ /dev/null @@ -1,2418 +0,0 @@ -#----------------------------------------------------------------------------- -# ply: yacc.py -# -# Author(s): David M. Beazley (beazley@cs.uchicago.edu) -# Department of Computer Science -# University of Chicago -# Chicago, IL 60637 -# -# Copyright (C) 2001-2004, David M. Beazley -# -# $Header$ -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -# -# See the file COPYING for a complete copy of the LGPL. -# -# -# This implements an LR parser that is constructed from grammar rules defined -# as Python functions. Roughly speaking, this module is a cross between -# John Aycock's Spark system and the GNU bison utility. -# -# The current implementation is only somewhat object-oriented. The -# LR parser itself is defined in terms of an object (which allows multiple -# parsers to co-exist). However, most of the variables used during table -# construction are defined in terms of global variables. Users shouldn't -# notice unless they are trying to define multiple parsers at the same -# time using threads (in which case they should have their head examined). -# -# This implementation supports both SLR and LALR(1) parsing. LALR(1) -# support was implemented by Elias Ioup (ezioup@alumni.uchicago.edu) -# and hacked abit by Dave to run faster. -# -# :::::::: WARNING ::::::: -# -# Construction of LR parsing tables is fairly complicated and expensive. -# To make this module run fast, a *LOT* of work has been put into -# optimization---often at the expensive of readability and what might -# consider to be good Python "coding style." Modify the code at your -# own risk! 
-# ---------------------------------------------------------------------------- - -__version__ = "1.5" - -#----------------------------------------------------------------------------- -# === User configurable parameters === -# -# Change these to modify the default behavior of yacc (if you wish) -#----------------------------------------------------------------------------- - -yaccdebug = 1 # Debugging mode. If set, yacc generates a - # a 'parser.out' file in the current directory - -debug_file = 'parser.out' # Default name of the debugging file -tab_module = 'parsetab' # Default name of the table module -default_lr = 'SLR' # Default LR table generation method - -error_count = 3 # Number of symbols that must be shifted to leave recovery mode - -import re, types, sys, cStringIO, md5, os.path - -# Exception raised for yacc-related errors -class YaccError(Exception): pass - -#----------------------------------------------------------------------------- -# === LR Parsing Engine === -# -# The following classes are used for the LR parser itself. These are not -# used during table construction and are independent of the actual LR -# table generation algorithm -#----------------------------------------------------------------------------- - -# This class is used to hold non-terminal grammar symbols during parsing. -# It normally has the following attributes set: -# .type = Grammar symbol type -# .value = Symbol value -# .lineno = Starting line number -# .endlineno = Ending line number (optional, set automatically) - -class YaccSymbol: - def __str__(self): return self.type - def __repr__(self): return str(self) - -# This class is a wrapper around the objects actually passed to each -# grammar rule. Index lookup and assignment actually assign the -# .value attribute of the underlying YaccSymbol object. -# The lineno() method returns the line number of a given -# item (or 0 if not defined). The linespan() method returns -# a tuple of (startline,endline) representing the range of lines -# for a symbol. - -class YaccProduction: - def __init__(self,s): - self.slice = s - self.pbstack = [] - - def __getitem__(self,n): - return self.slice[n].value - - def __setitem__(self,n,v): - self.slice[n].value = v - - def __len__(self): - return len(self.slice) - - def lineno(self,n): - return getattr(self.slice[n],"lineno",0) - - def linespan(self,n): - startline = getattr(self.slice[n],"lineno",0) - endline = getattr(self.slice[n],"endlineno",startline) - return startline,endline - - def pushback(self,n): - if n <= 0: - raise ValueError, "Expected a positive value" - if n > (len(self.slice)-1): - raise ValueError, "Can't push %d tokens. Only %d are available." % (n,len(self.slice)-1) - for i in range(0,n): - self.pbstack.append(self.slice[-i-1]) - -# The LR Parsing engine. This is defined as a class so that multiple parsers -# can exist in the same process. A user never instantiates this directly. -# Instead, the global yacc() function should be used to create a suitable Parser -# object. - -class Parser: - def __init__(self,magic=None): - - # This is a hack to keep users from trying to instantiate a Parser - # object directly. - - if magic != "xyzzy": - raise YaccError, "Can't instantiate Parser. Use yacc() instead." 
- - # Reset internal state - self.productions = None # List of productions - self.errorfunc = None # Error handling function - self.action = { } # LR Action table - self.goto = { } # LR goto table - self.require = { } # Attribute require table - self.method = "Unknown LR" # Table construction method used - - def errok(self): - self.errorcount = 0 - - def restart(self): - del self.statestack[:] - del self.symstack[:] - sym = YaccSymbol() - sym.type = '$' - self.symstack.append(sym) - self.statestack.append(0) - - def parse(self,input=None,lexer=None,debug=0): - lookahead = None # Current lookahead symbol - lookaheadstack = [ ] # Stack of lookahead symbols - actions = self.action # Local reference to action table - goto = self.goto # Local reference to goto table - prod = self.productions # Local reference to production list - pslice = YaccProduction(None) # Production object passed to grammar rules - pslice.parser = self # Parser object - self.errorcount = 0 # Used during error recovery - - # If no lexer was given, we will try to use the lex module - if not lexer: - import lex as lexer - - pslice.lexer = lexer - - # If input was supplied, pass to lexer - if input: - lexer.input(input) - - # Tokenize function - get_token = lexer.token - - statestack = [ ] # Stack of parsing states - self.statestack = statestack - symstack = [ ] # Stack of grammar symbols - self.symstack = symstack - - errtoken = None # Err token - - # The start state is assumed to be (0,$) - statestack.append(0) - sym = YaccSymbol() - sym.type = '$' - symstack.append(sym) - - while 1: - # Get the next symbol on the input. If a lookahead symbol - # is already set, we just use that. Otherwise, we'll pull - # the next token off of the lookaheadstack or from the lexer - if not lookahead: - if not lookaheadstack: - lookahead = get_token() # Get the next token - else: - lookahead = lookaheadstack.pop() - if not lookahead: - lookahead = YaccSymbol() - lookahead.type = '$' - if debug: - errorlead = ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip() - - # Check the action table - s = statestack[-1] - ltype = lookahead.type - t = actions.get((s,ltype),None) - - if t is not None: - if t > 0: - # shift a symbol on the stack - if ltype == '$': - # Error, end of input - sys.stderr.write("yacc: Parse error. 
EOF\n") - return - statestack.append(t) - if debug > 1: - sys.stderr.write("%-60s shift state %s\n" % (errorlead, t)) - symstack.append(lookahead) - lookahead = None - - # Decrease error count on successful shift - if self.errorcount > 0: - self.errorcount -= 1 - - continue - - if t < 0: - # reduce a symbol on the stack, emit a production - p = prod[-t] - pname = p.name - plen = p.len - - # Get production function - sym = YaccSymbol() - sym.type = pname # Production name - sym.value = None - if debug > 1: - sys.stderr.write("%-60s reduce %d\n" % (errorlead, -t)) - - if plen: - targ = symstack[-plen-1:] - targ[0] = sym - try: - sym.lineno = targ[1].lineno - sym.endlineno = getattr(targ[-1],"endlineno",targ[-1].lineno) - except AttributeError: - sym.lineno = 0 - del symstack[-plen:] - del statestack[-plen:] - else: - sym.lineno = 0 - targ = [ sym ] - pslice.slice = targ - pslice.pbstack = [] - # Call the grammar rule with our special slice object - p.func(pslice) - - # If there was a pushback, put that on the stack - if pslice.pbstack: - lookaheadstack.append(lookahead) - for _t in pslice.pbstack: - lookaheadstack.append(_t) - lookahead = None - - symstack.append(sym) - statestack.append(goto[statestack[-1],pname]) - continue - - if t == 0: - n = symstack[-1] - return getattr(n,"value",None) - sys.stderr.write(errorlead, "\n") - - if t == None: - if debug: - sys.stderr.write(errorlead + "\n") - # We have some kind of parsing error here. To handle - # this, we are going to push the current token onto - # the tokenstack and replace it with an 'error' token. - # If there are any synchronization rules, they may - # catch it. - # - # In addition to pushing the error token, we call call - # the user defined p_error() function if this is the - # first syntax error. This function is only called if - # errorcount == 0. - if not self.errorcount: - self.errorcount = error_count - errtoken = lookahead - if errtoken.type == '$': - errtoken = None # End of file! - if self.errorfunc: - global errok,token,restart - errok = self.errok # Set some special functions available in error recovery - token = get_token - restart = self.restart - tok = self.errorfunc(errtoken) - del errok, token, restart # Delete special functions - - if not self.errorcount: - # User must have done some kind of panic - # mode recovery on their own. The - # returned token is the next lookahead - lookahead = tok - errtoken = None - continue - else: - if errtoken: - if hasattr(errtoken,"lineno"): lineno = lookahead.lineno - else: lineno = 0 - if lineno: - sys.stderr.write("yacc: Syntax error at line %d, token=%s\n" % (lineno, errtoken.type)) - else: - sys.stderr.write("yacc: Syntax error, token=%s" % errtoken.type) - else: - sys.stderr.write("yacc: Parse error in input. EOF\n") - return - - else: - self.errorcount = error_count - - # case 1: the statestack only has 1 entry on it. If we're in this state, the - # entire parse has been rolled back and we're completely hosed. The token is - # discarded and we just keep going. - - if len(statestack) <= 1 and lookahead.type != '$': - lookahead = None - errtoken = None - # Nuke the pushback stack - del lookaheadstack[:] - continue - - # case 2: the statestack has a couple of entries on it, but we're - # at the end of the file. nuke the top entry and generate an error token - - # Start nuking entries on the stack - if lookahead.type == '$': - # Whoa. We're really hosed here. Bail out - return - - if lookahead.type != 'error': - sym = symstack[-1] - if sym.type == 'error': - # Hmmm. 
Error is on top of stack, we'll just nuke input - # symbol and continue - lookahead = None - continue - t = YaccSymbol() - t.type = 'error' - if hasattr(lookahead,"lineno"): - t.lineno = lookahead.lineno - t.value = lookahead - lookaheadstack.append(lookahead) - lookahead = t - else: - symstack.pop() - statestack.pop() - - continue - - # Call an error function here - raise RuntimeError, "yacc: internal parser error!!!\n" - -# ----------------------------------------------------------------------------- -# === Parser Construction === -# -# The following functions and variables are used to implement the yacc() function -# itself. This is pretty hairy stuff involving lots of error checking, -# construction of LR items, kernels, and so forth. Although a lot of -# this work is done using global variables, the resulting Parser object -# is completely self contained--meaning that it is safe to repeatedly -# call yacc() with different grammars in the same application. -# ----------------------------------------------------------------------------- - -# ----------------------------------------------------------------------------- -# validate_file() -# -# This function checks to see if there are duplicated p_rulename() functions -# in the parser module file. Without this function, it is really easy for -# users to make mistakes by cutting and pasting code fragments (and it's a real -# bugger to try and figure out why the resulting parser doesn't work). Therefore, -# we just do a little regular expression pattern matching of def statements -# to try and detect duplicates. -# ----------------------------------------------------------------------------- - -def validate_file(filename): - base,ext = os.path.splitext(filename) - if ext != '.py': return 1 # No idea. Assume it's okay. - - try: - f = open(filename) - lines = f.readlines() - f.close() - except IOError: - return 1 # Oh well - - # Match def p_funcname( - fre = re.compile(r'\s*def\s+(p_[a-zA-Z_0-9]*)\(') - counthash = { } - linen = 1 - noerror = 1 - for l in lines: - m = fre.match(l) - if m: - name = m.group(1) - prev = counthash.get(name) - if not prev: - counthash[name] = linen - else: - sys.stderr.write("%s:%d: Function %s redefined. Previously defined on line %d\n" % (filename,linen,name,prev)) - noerror = 0 - linen += 1 - return noerror - -# This function looks for functions that might be grammar rules, but which don't have the proper p_suffix. -def validate_dict(d): - for n,v in d.items(): - if n[0:2] == 'p_' and type(v) in (types.FunctionType, types.MethodType): continue - if n[0:2] == 't_': continue - - if n[0:2] == 'p_': - sys.stderr.write("yacc: Warning. '%s' not defined as a function\n" % n) - if 1 and isinstance(v,types.FunctionType) and v.func_code.co_argcount == 1: - try: - doc = v.__doc__.split(" ") - if doc[1] == ':': - sys.stderr.write("%s:%d: Warning. Possible grammar rule '%s' defined without p_ prefix.\n" % (v.func_code.co_filename, v.func_code.co_firstlineno,n)) - except StandardError: - pass - -# ----------------------------------------------------------------------------- -# === GRAMMAR FUNCTIONS === -# -# The following global variables and functions are used to store, manipulate, -# and verify the grammar rules specified by the user. 
-# ----------------------------------------------------------------------------- - -# Initialize all of the global variables used during grammar construction -def initialize_vars(): - global Productions, Prodnames, Prodmap, Terminals - global Nonterminals, First, Follow, Precedence, LRitems - global Errorfunc, Signature, Requires - - # LALR(1) globals - global Prodempty, TReductions, NTReductions, GotoSetNum, Canonical - - Productions = [None] # A list of all of the productions. The first - # entry is always reserved for the purpose of - # building an augmented grammar - - Prodnames = { } # A dictionary mapping the names of nonterminals to a list of all - # productions of that nonterminal. - - Prodmap = { } # A dictionary that is only used to detect duplicate - # productions. - - Terminals = { } # A dictionary mapping the names of terminal symbols to a - # list of the rules where they are used. - - Nonterminals = { } # A dictionary mapping names of nonterminals to a list - # of rule numbers where they are used. - - First = { } # A dictionary of precomputed FIRST(x) symbols - - Follow = { } # A dictionary of precomputed FOLLOW(x) symbols - - Precedence = { } # Precedence rules for each terminal. Contains tuples of the - # form ('right',level) or ('nonassoc', level) or ('left',level) - - LRitems = [ ] # A list of all LR items for the grammar. These are the - # productions with the "dot" like E -> E . PLUS E - - Errorfunc = None # User defined error handler - - Signature = md5.new() # Digital signature of the grammar rules, precedence - # and other information. Used to determined when a - # parsing table needs to be regenerated. - - Requires = { } # Requires list - - # LALR(1) Initialization - Prodempty = { } # A dictionary of all productions that have an empty rule - # of the form P : - - TReductions = { } # A dictionary of precomputer reductions from - # nonterminals to terminals - - NTReductions = { } # A dictionary of precomputed reductions from - # nonterminals to nonterminals - - GotoSetNum = { } # A dictionary that remembers goto sets based on - # the state number and symbol - - Canonical = { } # A list of LR item sets. A LR item set is a list of LR - # items that represent the state of the parser - - # File objects used when creating the parser.out debugging file - global _vf, _vfc - _vf = cStringIO.StringIO() - _vfc = cStringIO.StringIO() - -# ----------------------------------------------------------------------------- -# class Production: -# -# This class stores the raw information about a single production or grammar rule. -# It has a few required attributes: -# -# name - Name of the production (nonterminal) -# prod - A list of symbols making up its production -# number - Production number. -# -# In addition, a few additional attributes are used to help with debugging or -# optimization of table generation. -# -# file - File where production action is defined. -# lineno - Line number where action is defined -# func - Action function -# prec - Precedence level -# lr_next - Next LR item. Example, if we are ' E -> E . PLUS E' -# then lr_next refers to 'E -> E PLUS . E' -# lr_index - LR item index (location of the ".") in the prod list. 
-# lookaheads - LALR lookahead symbols for this item -# len - Length of the production (number of symbols on right hand side) -# ----------------------------------------------------------------------------- - -class Production: - def __init__(self,**kw): - for k,v in kw.items(): - setattr(self,k,v) - self.lr_index = -1 - self.lr0_added = 0 # Flag indicating whether or not added to LR0 closure - self.lr1_added = 0 # Flag indicating whether or not added to LR1 - self.usyms = [ ] - self.lookaheads = { } - self.lk_added = 0 - self.setnumbers = [ ] - - def __str__(self): - if self.prod: - s = "%s -> %s" % (self.name," ".join(self.prod)) - else: - s = "%s -> " % self.name - return s - - def __repr__(self): - return str(self) - - # Compute lr_items from the production - def lr_item(self,n): - if n > len(self.prod): return None - p = Production() - p.name = self.name - p.prod = list(self.prod) - p.number = self.number - p.lr_index = n - p.lookaheads = { } - p.setnumbers = self.setnumbers - p.prod.insert(n,".") - p.prod = tuple(p.prod) - p.len = len(p.prod) - p.usyms = self.usyms - - # Precompute list of productions immediately following - try: - p.lrafter = Prodnames[p.prod[n+1]] - except (IndexError,KeyError),e: - p.lrafter = [] - try: - p.lrbefore = p.prod[n-1] - except IndexError: - p.lrbefore = None - - return p - -class MiniProduction: - pass - -# Utility function -def is_identifier(s): - for c in s: - if not (c.isalnum() or c == '_'): return 0 - return 1 - -# ----------------------------------------------------------------------------- -# add_production() -# -# Given an action function, this function assembles a production rule. -# The production rule is assumed to be found in the function's docstring. -# This rule has the general syntax: -# -# name1 ::= production1 -# | production2 -# | production3 -# ... -# | productionn -# name2 ::= production1 -# | production2 -# ... -# ----------------------------------------------------------------------------- - -def add_production(f,file,line,prodname,syms): - - if Terminals.has_key(prodname): - sys.stderr.write("%s:%d: Illegal rule name '%s'. Already defined as a token.\n" % (file,line,prodname)) - return -1 - if prodname == 'error': - sys.stderr.write("%s:%d: Illegal rule name '%s'. error is a reserved word.\n" % (file,line,prodname)) - return -1 - - if not is_identifier(prodname): - sys.stderr.write("%s:%d: Illegal rule name '%s'\n" % (file,line,prodname)) - return -1 - - for s in syms: - if not is_identifier(s) and s != '%prec': - sys.stderr.write("%s:%d: Illegal name '%s' in rule '%s'\n" % (file,line,s, prodname)) - return -1 - - # See if the rule is already in the rulemap - map = "%s -> %s" % (prodname,syms) - if Prodmap.has_key(map): - m = Prodmap[map] - sys.stderr.write("%s:%d: Duplicate rule %s.\n" % (file,line, m)) - sys.stderr.write("%s:%d: Previous definition at %s:%d\n" % (file,line, m.file, m.line)) - return -1 - - p = Production() - p.name = prodname - p.prod = syms - p.file = file - p.line = line - p.func = f - p.number = len(Productions) - - - Productions.append(p) - Prodmap[map] = p - if not Nonterminals.has_key(prodname): - Nonterminals[prodname] = [ ] - - # Add all terminals to Terminals - i = 0 - while i < len(p.prod): - t = p.prod[i] - if t == '%prec': - try: - precname = p.prod[i+1] - except IndexError: - sys.stderr.write("%s:%d: Syntax error. 
Nothing follows %%prec.\n" % (p.file,p.line)) - return -1 - - prec = Precedence.get(precname,None) - if not prec: - sys.stderr.write("%s:%d: Nothing known about the precedence of '%s'\n" % (p.file,p.line,precname)) - return -1 - else: - p.prec = prec - del p.prod[i] - del p.prod[i] - continue - - if Terminals.has_key(t): - Terminals[t].append(p.number) - # Is a terminal. We'll assign a precedence to p based on this - if not hasattr(p,"prec"): - p.prec = Precedence.get(t,('right',0)) - else: - if not Nonterminals.has_key(t): - Nonterminals[t] = [ ] - Nonterminals[t].append(p.number) - i += 1 - - if not hasattr(p,"prec"): - p.prec = ('right',0) - - # Set final length of productions - p.len = len(p.prod) - p.prod = tuple(p.prod) - - # Calculate unique syms in the production - p.usyms = [ ] - for s in p.prod: - if s not in p.usyms: - p.usyms.append(s) - - # Add to the global productions list - try: - Prodnames[p.name].append(p) - except KeyError: - Prodnames[p.name] = [ p ] - return 0 - -# Given a raw rule function, this function rips out its doc string -# and adds rules to the grammar - -def add_function(f): - line = f.func_code.co_firstlineno - file = f.func_code.co_filename - error = 0 - - if isinstance(f,types.MethodType): - reqdargs = 2 - else: - reqdargs = 1 - - if f.func_code.co_argcount > reqdargs: - sys.stderr.write("%s:%d: Rule '%s' has too many arguments.\n" % (file,line,f.__name__)) - return -1 - - if f.func_code.co_argcount < reqdargs: - sys.stderr.write("%s:%d: Rule '%s' requires an argument.\n" % (file,line,f.__name__)) - return -1 - - if f.__doc__: - # Split the doc string into lines - pstrings = f.__doc__.splitlines() - lastp = None - dline = line - for ps in pstrings: - dline += 1 - p = ps.split() - if not p: continue - try: - if p[0] == '|': - # This is a continuation of a previous rule - if not lastp: - sys.stderr.write("%s:%d: Misplaced '|'.\n" % (file,dline)) - return -1 - prodname = lastp - if len(p) > 1: - syms = p[1:] - else: - syms = [ ] - else: - prodname = p[0] - lastp = prodname - assign = p[1] - if len(p) > 2: - syms = p[2:] - else: - syms = [ ] - if assign != ':' and assign != '::=': - sys.stderr.write("%s:%d: Syntax error. Expected ':'\n" % (file,dline)) - return -1 - e = add_production(f,file,dline,prodname,syms) - error += e - except StandardError: - sys.stderr.write("%s:%d: Syntax error in rule '%s'\n" % (file,dline,ps)) - error -= 1 - else: - sys.stderr.write("%s:%d: No documentation string specified in function '%s'\n" % (file,line,f.__name__)) - return error - - -# Cycle checking code (Michael Dyck) - -def compute_reachable(): - ''' - Find each symbol that can be reached from the start symbol. - Print a warning for any nonterminals that can't be reached. - (Unused terminals have already had their warning.) - ''' - Reachable = { } - for s in Terminals.keys() + Nonterminals.keys(): - Reachable[s] = 0 - - mark_reachable_from( Productions[0].prod[0], Reachable ) - - for s in Nonterminals.keys(): - if not Reachable[s]: - sys.stderr.write("yacc: Symbol '%s' is unreachable.\n" % s) - -def mark_reachable_from(s, Reachable): - ''' - Mark all symbols that are reachable from symbol s. - ''' - if Reachable[s]: - # We've already reached symbol s. 
- return - Reachable[s] = 1 - for p in Prodnames.get(s,[]): - for r in p.prod: - mark_reachable_from(r, Reachable) - -# ----------------------------------------------------------------------------- -# compute_terminates() -# -# This function looks at the various parsing rules and tries to detect -# infinite recursion cycles (grammar rules where there is no possible way -# to derive a string of only terminals). -# ----------------------------------------------------------------------------- -def compute_terminates(): - ''' - Raise an error for any symbols that don't terminate. - ''' - Terminates = {} - - # Terminals: - for t in Terminals.keys(): - Terminates[t] = 1 - - Terminates['$'] = 1 - - # Nonterminals: - - # Initialize to false: - for n in Nonterminals.keys(): - Terminates[n] = 0 - - # Then propagate termination until no change: - while 1: - some_change = 0 - for (n,pl) in Prodnames.items(): - # Nonterminal n terminates iff any of its productions terminates. - for p in pl: - # Production p terminates iff all of its rhs symbols terminate. - for s in p.prod: - if not Terminates[s]: - # The symbol s does not terminate, - # so production p does not terminate. - p_terminates = 0 - break - else: - # didn't break from the loop, - # so every symbol s terminates - # so production p terminates. - p_terminates = 1 - - if p_terminates: - # symbol n terminates! - if not Terminates[n]: - Terminates[n] = 1 - some_change = 1 - # Don't need to consider any more productions for this n. - break - - if not some_change: - break - - some_error = 0 - for (s,terminates) in Terminates.items(): - if not terminates: - if not Prodnames.has_key(s) and not Terminals.has_key(s) and s != 'error': - # s is used-but-not-defined, and we've already warned of that, - # so it would be overkill to say that it's also non-terminating. - pass - else: - sys.stderr.write("yacc: Infinite recursion detected for symbol '%s'.\n" % s) - some_error = 1 - - return some_error - -# ----------------------------------------------------------------------------- -# verify_productions() -# -# This function examines all of the supplied rules to see if they seem valid. -# ----------------------------------------------------------------------------- -def verify_productions(cycle_check=1): - error = 0 - for p in Productions: - if not p: continue - - for s in p.prod: - if not Prodnames.has_key(s) and not Terminals.has_key(s) and s != 'error': - sys.stderr.write("%s:%d: Symbol '%s' used, but not defined as a token or a rule.\n" % (p.file,p.line,s)) - error = 1 - continue - - unused_tok = 0 - # Now verify all of the tokens - if yaccdebug: - _vf.write("Unused terminals:\n\n") - for s,v in Terminals.items(): - if s != 'error' and not v: - sys.stderr.write("yacc: Warning. Token '%s' defined, but not used.\n" % s) - if yaccdebug: _vf.write(" %s\n"% s) - unused_tok += 1 - - # Print out all of the productions - if yaccdebug: - _vf.write("\nGrammar\n\n") - for i in range(1,len(Productions)): - _vf.write("Rule %-5d %s\n" % (i, Productions[i])) - - unused_prod = 0 - # Verify the use of all productions - for s,v in Nonterminals.items(): - if not v: - p = Prodnames[s][0] - sys.stderr.write("%s:%d: Warning. Rule '%s' defined, but not used.\n" % (p.file,p.line, s)) - unused_prod += 1 - - - if unused_tok == 1: - sys.stderr.write("yacc: Warning. There is 1 unused token.\n") - if unused_tok > 1: - sys.stderr.write("yacc: Warning. There are %d unused tokens.\n" % unused_tok) - - if unused_prod == 1: - sys.stderr.write("yacc: Warning. 
There is 1 unused rule.\n") - if unused_prod > 1: - sys.stderr.write("yacc: Warning. There are %d unused rules.\n" % unused_prod) - - if yaccdebug: - _vf.write("\nTerminals, with rules where they appear\n\n") - ks = Terminals.keys() - ks.sort() - for k in ks: - _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Terminals[k]]))) - _vf.write("\nNonterminals, with rules where they appear\n\n") - ks = Nonterminals.keys() - ks.sort() - for k in ks: - _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Nonterminals[k]]))) - - if (cycle_check): - compute_reachable() - error += compute_terminates() -# error += check_cycles() - return error - -# ----------------------------------------------------------------------------- -# build_lritems() -# -# This function walks the list of productions and builds a complete set of the -# LR items. The LR items are stored in two ways: First, they are uniquely -# numbered and placed in the list _lritems. Second, a linked list of LR items -# is built for each production. For example: -# -# E -> E PLUS E -# -# Creates the list -# -# [E -> . E PLUS E, E -> E . PLUS E, E -> E PLUS . E, E -> E PLUS E . ] -# ----------------------------------------------------------------------------- - -def build_lritems(): - for p in Productions: - lastlri = p - lri = p.lr_item(0) - i = 0 - while 1: - lri = p.lr_item(i) - lastlri.lr_next = lri - if not lri: break - lri.lr_num = len(LRitems) - LRitems.append(lri) - lastlri = lri - i += 1 - - # In order for the rest of the parser generator to work, we need to - # guarantee that no more lritems are generated. Therefore, we nuke - # the p.lr_item method. (Only used in debugging) - # Production.lr_item = None - -# ----------------------------------------------------------------------------- -# add_precedence() -# -# Given a list of precedence rules, add to the precedence table. -# ----------------------------------------------------------------------------- - -def add_precedence(plist): - plevel = 0 - error = 0 - for p in plist: - plevel += 1 - try: - prec = p[0] - terms = p[1:] - if prec != 'left' and prec != 'right' and prec != 'nonassoc': - sys.stderr.write("yacc: Invalid precedence '%s'\n" % prec) - return -1 - for t in terms: - if Precedence.has_key(t): - sys.stderr.write("yacc: Precedence already specified for terminal '%s'\n" % t) - error += 1 - continue - Precedence[t] = (prec,plevel) - except: - sys.stderr.write("yacc: Invalid precedence table.\n") - error += 1 - - return error - -# ----------------------------------------------------------------------------- -# augment_grammar() -# -# Compute the augmented grammar. This is just a rule S' -> start where start -# is the starting symbol. -# ----------------------------------------------------------------------------- - -def augment_grammar(start=None): - if not start: - start = Productions[1].name - Productions[0] = Production(name="S'",prod=[start],number=0,len=1,prec=('right',0),func=None) - Productions[0].usyms = [ start ] - Nonterminals[start].append(0) - - -# ------------------------------------------------------------------------- -# first() -# -# Compute the value of FIRST1(beta) where beta is a tuple of symbols. -# -# During execution of compute_first1, the result may be incomplete. -# Afterward (e.g., when called from compute_follow()), it will be complete. 
-# ------------------------------------------------------------------------- -def first(beta): - - # We are computing First(x1,x2,x3,...,xn) - result = [ ] - for x in beta: - x_produces_empty = 0 - - # Add all the non- symbols of First[x] to the result. - for f in First[x]: - if f == '': - x_produces_empty = 1 - else: - if f not in result: result.append(f) - - if x_produces_empty: - # We have to consider the next x in beta, - # i.e. stay in the loop. - pass - else: - # We don't have to consider any further symbols in beta. - break - else: - # There was no 'break' from the loop, - # so x_produces_empty was true for all x in beta, - # so beta produces empty as well. - result.append('') - - return result - - -# FOLLOW(x) -# Given a non-terminal. This function computes the set of all symbols -# that might follow it. Dragon book, p. 189. - -def compute_follow(start=None): - # Add '$' to the follow list of the start symbol - for k in Nonterminals.keys(): - Follow[k] = [ ] - - if not start: - start = Productions[1].name - - Follow[start] = [ '$' ] - - while 1: - didadd = 0 - for p in Productions[1:]: - # Here is the production set - for i in range(len(p.prod)): - B = p.prod[i] - if Nonterminals.has_key(B): - # Okay. We got a non-terminal in a production - fst = first(p.prod[i+1:]) - hasempty = 0 - for f in fst: - if f != '' and f not in Follow[B]: - Follow[B].append(f) - didadd = 1 - if f == '': - hasempty = 1 - if hasempty or i == (len(p.prod)-1): - # Add elements of follow(a) to follow(b) - for f in Follow[p.name]: - if f not in Follow[B]: - Follow[B].append(f) - didadd = 1 - if not didadd: break - - if 0 and yaccdebug: - _vf.write('\nFollow:\n') - for k in Nonterminals.keys(): - _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Follow[k]]))) - -# ------------------------------------------------------------------------- -# compute_first1() -# -# Compute the value of FIRST1(X) for all symbols -# ------------------------------------------------------------------------- -def compute_first1(): - - # Terminals: - for t in Terminals.keys(): - First[t] = [t] - - First['$'] = ['$'] - First['#'] = ['#'] # what's this for? - - # Nonterminals: - - # Initialize to the empty set: - for n in Nonterminals.keys(): - First[n] = [] - - # Then propagate symbols until no change: - while 1: - some_change = 0 - for n in Nonterminals.keys(): - for p in Prodnames[n]: - for f in first(p.prod): - if f not in First[n]: - First[n].append( f ) - some_change = 1 - if not some_change: - break - - if 0 and yaccdebug: - _vf.write('\nFirst:\n') - for k in Nonterminals.keys(): - _vf.write("%-20s : %s\n" % - (k, " ".join([str(s) for s in First[k]]))) - -# ----------------------------------------------------------------------------- -# === SLR Generation === -# -# The following functions are used to construct SLR (Simple LR) parsing tables -# as described on p.221-229 of the dragon book. -# ----------------------------------------------------------------------------- - -# Global variables for the LR parsing engine -def lr_init_vars(): - global _lr_action, _lr_goto, _lr_method - global _lr_goto_cache - - _lr_action = { } # Action table - _lr_goto = { } # Goto table - _lr_method = "Unknown" # LR method used - _lr_goto_cache = { } - -# Compute the LR(0) closure operation on I, where I is a set of LR(0) items. -# prodlist is a list of productions. 
- -_add_count = 0 # Counter used to detect cycles - -def lr0_closure(I): - global _add_count - - _add_count += 1 - prodlist = Productions - - # Add everything in I to J - J = I[:] - didadd = 1 - while didadd: - didadd = 0 - for j in J: - for x in j.lrafter: - if x.lr0_added == _add_count: continue - # Add B --> .G to J - J.append(x.lr_next) - x.lr0_added = _add_count - didadd = 1 - - return J - -# Compute the LR(0) goto function goto(I,X) where I is a set -# of LR(0) items and X is a grammar symbol. This function is written -# in a way that guarantees uniqueness of the generated goto sets -# (i.e. the same goto set will never be returned as two different Python -# objects). With uniqueness, we can later do fast set comparisons using -# id(obj) instead of element-wise comparison. - -def lr0_goto(I,x): - # First we look for a previously cached entry - g = _lr_goto_cache.get((id(I),x),None) - if g: return g - - # Now we generate the goto set in a way that guarantees uniqueness - # of the result - - s = _lr_goto_cache.get(x,None) - if not s: - s = { } - _lr_goto_cache[x] = s - - gs = [ ] - for p in I: - n = p.lr_next - if n and n.lrbefore == x: - s1 = s.get(id(n),None) - if not s1: - s1 = { } - s[id(n)] = s1 - gs.append(n) - s = s1 - g = s.get('$',None) - if not g: - if gs: - g = lr0_closure(gs) - s['$'] = g - else: - s['$'] = gs - _lr_goto_cache[(id(I),x)] = g - return g - -# Added for LALR(1) - -# Given a setnumber of an lr0 state and a symbol return the setnumber of the goto state -def lr0_goto_setnumber(I_setnumber, x): - global Canonical - global GotoSetNum - - if GotoSetNum.has_key((I_setnumber, x)): - setnumber = GotoSetNum[(I_setnumber, x)] - else: - gset = lr0_goto(Canonical[I_setnumber], x) - if not gset: - return -1 - else: - gsetlen = len(gset) - for i in xrange(len(gset[0].setnumbers)): - inall = 1 - for item in gset: - if not item.setnumbers[i]: - inall = 0 - break - if inall and len(Canonical[i]) == gsetlen: - setnumber = i - break # Note: DB. I added this to improve performance. - # Not sure if this breaks the algorithm (it doesn't appear to). - - GotoSetNum[(I_setnumber, x)] = setnumber - - return setnumber - -# Compute the kernel of a set of LR(0) items -def lr0_kernel(I): - KI = [ ] - for p in I: - if p.name == "S'" or p.lr_index > 0 or p.len == 0: - KI.append(p) - - return KI - -_lr0_cidhash = { } - -# Compute the LR(0) sets of item function -def lr0_items(): - - C = [ lr0_closure([Productions[0].lr_next]) ] - i = 0 - for I in C: - _lr0_cidhash[id(I)] = i - i += 1 - - # Loop over the items in C and each grammar symbols - i = 0 - while i < len(C): - I = C[i] - i += 1 - - # Collect all of the symbols that could possibly be in the goto(I,X) sets - asyms = { } - for ii in I: - for s in ii.usyms: - asyms[s] = None - - for x in asyms.keys(): - g = lr0_goto(I,x) - if not g: continue - if _lr0_cidhash.has_key(id(g)): continue - _lr0_cidhash[id(g)] = len(C) - C.append(g) - - return C - -# ----------------------------------------------------------------------------- -# slr_parse_table() -# -# This function constructs an SLR table. 
-# ----------------------------------------------------------------------------- -def slr_parse_table(): - global _lr_method - goto = _lr_goto # Goto array - action = _lr_action # Action array - actionp = { } # Action production array (temporary) - - _lr_method = "SLR" - - n_srconflict = 0 - n_rrconflict = 0 - - if yaccdebug: - sys.stderr.write("yacc: Generating SLR parsing table...\n") - _vf.write("\n\nParsing method: SLR\n\n") - - # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items - # This determines the number of states - - C = lr0_items() - - # Build the parser table, state by state - st = 0 - for I in C: - # Loop over each production in I - actlist = [ ] # List of actions - - if yaccdebug: - _vf.write("\nstate %d\n\n" % st) - for p in I: - _vf.write(" (%d) %s\n" % (p.number, str(p))) - _vf.write("\n") - - for p in I: - try: - if p.prod[-1] == ".": - if p.name == "S'": - # Start symbol. Accept! - action[st,"$"] = 0 - actionp[st,"$"] = p - else: - # We are at the end of a production. Reduce! - for a in Follow[p.name]: - actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p))) - r = action.get((st,a),None) - if r is not None: - # Whoa. Have a shift/reduce or reduce/reduce conflict - if r > 0: - # Need to decide on shift or reduce here - # By default we favor shifting. Need to add - # some precedence rules here. - sprec,slevel = Productions[actionp[st,a].number].prec - rprec,rlevel = Precedence.get(a,('right',0)) - if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): - # We really need to reduce here. - action[st,a] = -p.number - actionp[st,a] = p - if not slevel and not rlevel: - _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) - n_srconflict += 1 - elif (slevel == rlevel) and (rprec == 'nonassoc'): - action[st,a] = None - else: - # Hmmm. Guess we'll keep the shift - if not slevel and not rlevel: - _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) - n_srconflict +=1 - elif r < 0: - # Reduce/reduce conflict. In this case, we favor the rule - # that was defined first in the grammar file - oldp = Productions[-r] - pp = Productions[p.number] - if oldp.line > pp.line: - action[st,a] = -p.number - actionp[st,a] = p - # sys.stderr.write("Reduce/reduce conflict in state %d\n" % st) - n_rrconflict += 1 - _vfc.write("reduce/reduce conflict in state %d resolved using rule %d (%s).\n" % (st, actionp[st,a].number, actionp[st,a])) - _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d (%s).\n" % (a,actionp[st,a].number, actionp[st,a])) - else: - sys.stderr.write("Unknown conflict in state %d\n" % st) - else: - action[st,a] = -p.number - actionp[st,a] = p - else: - i = p.lr_index - a = p.prod[i+1] # Get symbol right after the "." - if Terminals.has_key(a): - g = lr0_goto(I,a) - j = _lr0_cidhash.get(id(g),-1) - if j >= 0: - # We are in a shift state - actlist.append((a,p,"shift and go to state %d" % j)) - r = action.get((st,a),None) - if r is not None: - # Whoa have a shift/reduce or shift/shift conflict - if r > 0: - if r != j: - sys.stderr.write("Shift/shift conflict in state %d\n" % st) - elif r < 0: - # Do a precedence check. - # - if precedence of reduce rule is higher, we reduce. - # - if precedence of reduce is same and left assoc, we reduce. 
- # - otherwise we shift - rprec,rlevel = Productions[actionp[st,a].number].prec - sprec,slevel = Precedence.get(a,('right',0)) - if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')): - # We decide to shift here... highest precedence to shift - action[st,a] = j - actionp[st,a] = p - if not slevel and not rlevel: - n_srconflict += 1 - _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) - elif (slevel == rlevel) and (rprec == 'nonassoc'): - action[st,a] = None - else: - # Hmmm. Guess we'll keep the reduce - if not slevel and not rlevel: - n_srconflict +=1 - _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) - - else: - sys.stderr.write("Unknown conflict in state %d\n" % st) - else: - action[st,a] = j - actionp[st,a] = p - - except StandardError,e: - raise YaccError, "Hosed in slr_parse_table", e - - # Print the actions associated with each terminal - if yaccdebug: - _actprint = { } - for a,p,m in actlist: - if action.has_key((st,a)): - if p is actionp[st,a]: - _vf.write(" %-15s %s\n" % (a,m)) - _actprint[(a,m)] = 1 - _vf.write("\n") - for a,p,m in actlist: - if action.has_key((st,a)): - if p is not actionp[st,a]: - if not _actprint.has_key((a,m)): - _vf.write(" ! %-15s [ %s ]\n" % (a,m)) - _actprint[(a,m)] = 1 - - # Construct the goto table for this state - if yaccdebug: - _vf.write("\n") - nkeys = { } - for ii in I: - for s in ii.usyms: - if Nonterminals.has_key(s): - nkeys[s] = None - for n in nkeys.keys(): - g = lr0_goto(I,n) - j = _lr0_cidhash.get(id(g),-1) - if j >= 0: - goto[st,n] = j - if yaccdebug: - _vf.write(" %-30s shift and go to state %d\n" % (n,j)) - - st += 1 - - if yaccdebug: - if n_srconflict == 1: - sys.stderr.write("yacc: %d shift/reduce conflict\n" % n_srconflict) - if n_srconflict > 1: - sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict) - if n_rrconflict == 1: - sys.stderr.write("yacc: %d reduce/reduce conflict\n" % n_rrconflict) - if n_rrconflict > 1: - sys.stderr.write("yacc: %d reduce/reduce conflicts\n" % n_rrconflict) - - - -# ----------------------------------------------------------------------------- -# ==== LALR(1) Parsing ==== -# FINISHED! 5/20/2003 by Elias Ioup -# ----------------------------------------------------------------------------- - - -# Compute the lr1_closure of a set I. I is a list of productions and setnumber -# is the state that you want the lr items that are made from the to come from. 
- -_lr1_add_count = 0 - -def lr1_closure(I, setnumber = 0): - global _add_count - global Nonterminals - - _add_count += 1 - prodlist = Productions - - # Add everything in I to J - J = I[:] - Jhash = { } - for j in J: - Jhash[id(j)] = 1 - - didadd = 1 - while didadd: - didadd = 0 - for j in J: - jprod = j.prod - jlr_index = j.lr_index - jprodslice = jprod[jlr_index+2:] - - if jlr_index < len(jprod) - 1 and Nonterminals.has_key(jprod[jlr_index+1]): - first_syms = [] - if j.lk_added < len(j.lookaheads[setnumber]): - for a in j.lookaheads[setnumber][j.lk_added:]: - # find b in FIRST(Xa) if j = [A->a.BX,a] - temp_first_syms = first(jprodslice + (a,)) - for x in temp_first_syms: - if x not in first_syms: - first_syms.append(x) - - j.lk_added = len(j.lookaheads[setnumber]) - - for x in j.lrafter: - - # Add B --> .G to J - if x.lr_next.lookaheads.has_key(setnumber): - _xlook = x.lr_next.lookaheads[setnumber] - for s in first_syms: - if s not in _xlook: - _xlook.append(s) - didadd = 1 - else: - didadd = 0 - else: - x.lr_next.lookaheads[setnumber] = first_syms - didadd = 1 - - nid = id(x.lr_next) - if not Jhash.has_key(nid): - J.append(x.lr_next) - Jhash[nid] = 1 - - return J - -def add_lookaheads(K): - spontaneous = [] - propogate = [] - - for setnumber in range(len(K)): - for kitem in K[setnumber]: - kitem.lookaheads[setnumber] = ['#'] - J = lr1_closure([kitem], setnumber) - - # find the lookaheads that are spontaneously created from closures - # and the propogations of lookaheads between lr items - for item in J: - if item.lr_index < len(item.prod)-1: - for lookahead in item.lookaheads[setnumber]: - if lookahead != '#': - goto_setnumber = lr0_goto_setnumber(setnumber, item.prod[item.lr_index+1]) - next = None - if item.lr_next in K[goto_setnumber]: - next = item.lr_next - if next: - spontaneous.append((next, (lookahead, goto_setnumber))) - else: - goto_setnumber = lr0_goto_setnumber(setnumber, item.prod[item.lr_index+1]) - next = None - if goto_setnumber > -1: - if item.lr_next in K[goto_setnumber]: - next = item.lr_next - - if next: - propogate.append(((kitem, setnumber), (next, goto_setnumber))) - - - - for x in K[setnumber]: - x.lookaheads[setnumber] = [] - - for x in spontaneous: - if x[1][0] not in x[0].lookaheads[x[1][1]]: - x[0].lookaheads[x[1][1]].append(x[1][0]) - - K[0][0].lookaheads[0] = ['$'] - - pitems = {} - for x in propogate: - if pitems.has_key(x[0]): - pitems[x[0]].append(x[1]) - else: - pitems[x[0]] = [] - pitems[x[0]].append(x[1]) - - # propogate the lookaheads that were spontaneously generated - # based on the propogations produced above - stop = 0 - - while not stop: - stop = 1 - kindex = 0 - for set in K: - for item in set: - pkey = (item, kindex) - if pitems.has_key(pkey): - for propogation in pitems[pkey]: - gitem = propogation[0] - gsetnumber = propogation[1] - glookaheads = gitem.lookaheads[gsetnumber] - for lookahead in item.lookaheads[kindex]: - if lookahead not in glookaheads: - glookaheads.append(lookahead) - stop = 0 - kindex += 1 - -def ReduceNonterminals(): - global Nonterminals - - global TReductions - global NTReductions - - for nt in Nonterminals.keys(): - TReductions[nt] = [] - NTReductions[nt] = [] - - for nt in Nonterminals.keys(): - terms = ReduceToTerminals(nt) - TReductions[nt].extend(terms) - if not NTReductions.has_key(nt): - ReduceToNonterminals(nt) - - - -def ReduceToTerminals(nt): - global Prodnames - global Terminals - reducedterminals = [] - - for p in Prodnames[nt]: - if len(p.prod) > 0: - if Terminals.has_key(p.prod[0]): - if p.prod[0] not in 
reducedterminals: - reducedterminals.append(p.prod[0]) - else: - if p.prod[0] != nt: - terms = ReduceToTerminals(p.prod[0]) - for t in terms: - if t not in reducedterminals: - reducedterminals.append(t) - - return reducedterminals - - -def ReduceToNonterminals(nt): - global Prodnames - global Nonterminals - global NTReductions - reducednonterminals = [] - - for p in Prodnames[nt]: - if len(p.prod) > 0: - if Nonterminals.has_key(p.prod[0]): - if p.prod[0] not in reducednonterminals: - reducednonterminals.append(p.prod[0]) - if p.prod[0] != nt: - if not NTReductions.has_key(p.prod[0]): - ReduceToNonterminals(p.prod[0]) - - nterms = NTReductions[p.prod[0]] - for nt in nterms: - if nt not in reducednonterminals: - reducednonterminals.append(nt) - - - NTReductions[nt] = reducednonterminals - -# ----------------------------------------------------------------------------- -# lalr_parse_table() -# -# This function constructs an LALR table. -# ----------------------------------------------------------------------------- -def lalr_parse_table(): - global _lr_method - goto = _lr_goto # Goto array - action = _lr_action # Action array - actionp = { } # Action production array (temporary) - goto_cache = _lr_goto_cache - cid_hash = _lr0_cidhash - - _lr_method = "LALR" - - n_srconflict = 0 - n_rrconflict = 0 - - if yaccdebug: - sys.stderr.write("yacc: Generating LALR(1) parsing table...\n") - _vf.write("\n\nParsing method: LALR(1)\n\n") - - # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items - # This determines the number of states - - C = lr0_items() - - global Canonical - Canonical = C - - ### - # Create the kernel states. - ### - K = [] - setC = [0]*len(C) - for x in C: - K.append(lr0_kernel(x)) - for y in x: - y.setnumbers = setC[:] - - _cindex = 0 - for x in C: - for y in x: - y.lookaheads[_cindex] = [] - y.setnumbers[_cindex] = 1 - _cindex = _cindex + 1 - - ### - # Add lookaheads to the lr items - ### - - add_lookaheads(K) - - ### - # Do the reductions for parsing first and keep them in globals - ### - - ReduceNonterminals() - - global TReductions - global NTReductions - global Prodempty - - EmptyAncestors = {} - for y in Prodempty.keys(): - EmptyAncestors[y] = [] - for x in NTReductions.items(): - for y in x[1]: - if Prodempty.has_key(y): - EmptyAncestors[y].append(x[0]) - - - # Build the parser table, state by state - st = 0 - for I in C: - # Loop over each production in I - actlist = [ ] # List of actions - acthash = { } - - idI = id(I) - - if yaccdebug: - _vf.write("\nstate %d\n\n" % st) - for p in I: - _vf.write(" (%d) %s\n" % (p.number, str(p))) - _vf.write("\n") - - global First - for p in I: - try: - if p.prod[-1] == ".": - if p.name == "S'": - # Start symbol. Accept! - action[st,"$"] = 0 - actionp[st,"$"] = p - elif len(p.prod) == 0: - ancestors = EmptyAncestors[p.name] - for i in ancestors: - for s in K: - if i in s: - input_list = [] - plist = Productions[i.name] - for x in plist: - if len(x.prod) > 0 and x.prod[0] == p.name: - n = p.prod[1:] - d = x.prod[lr_index+2:] - for l in x.lookaheads.items(): - flist = First[tuple(n+d+[l])] - for f in flist: - if f not in input_list and f in p.lookaheads[st]: - input_list.append(f) - - # We are at the end of a production. Reduce! - #print "input_list: %s" % input_list - #print "Follow[p.name]: %s" % Follow[p.name] - for a in input_list: - actlist.append((a,p,"reduce using rule %d (%s) " % (p.number,p))) - r = action.get((st,a),None) - if r is not None: - # Whoa. 
Have a shift/reduce or reduce/reduce conflict - if r > 0: - # Need to decide on shift or reduce here - # By default we favor shifting. Need to add - # some precedence rules here. - sprec,slevel = Productions[actionp[st,a].number].prec - rprec,rlevel = Precedence.get(a,('right',0)) - if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): - # We really need to reduce here. - action[st,a] = -p.number - actionp[st,a] = p - if not slevel and not rlevel: - _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) - n_srconflict += 1 - elif (slevel == rlevel) and (rprec == 'nonassoc'): - action[st,a] = None - else: - # Hmmm. Guess we'll keep the shift - if not slevel and not rlevel: - _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) - n_srconflict +=1 - elif r < 0: - # Reduce/reduce conflict. In this case, we favor the rule - # that was defined first in the grammar file - oldp = Productions[-r] - pp = Productions[p.number] - if oldp.line > pp.line: - action[st,a] = -p.number - actionp[st,a] = p - # print "Reduce/reduce conflict in state %d" % st - n_rrconflict += 1 - _vfc.write("reduce/reduce conflict in state %d resolved using rule %d.\n" % (st, actionp[st,a].number)) - _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d.\n" % (a,actionp[st,a].number)) - else: - sys.stderr.write("Unknown conflict in state %d\n" % st) - else: - action[st,a] = -p.number - actionp[st,a] = p - - break # break out of the for s in K loop because we only want to make - # sure that a production is in the Kernel - - else: - # We are at the end of a production. Reduce! - - for a in p.lookaheads[st]: - actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p))) - r = action.get((st,a),None) - if r is not None: - # Whoa. Have a shift/reduce or reduce/reduce conflict - if r > 0: - # Need to decide on shift or reduce here - # By default we favor shifting. Need to add - # some precedence rules here. - sprec,slevel = Productions[actionp[st,a].number].prec - rprec,rlevel = Precedence.get(a,('right',0)) - if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): - # We really need to reduce here. - action[st,a] = -p.number - actionp[st,a] = p - if not slevel and not rlevel: - _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) - n_srconflict += 1 - elif (slevel == rlevel) and (rprec == 'nonassoc'): - action[st,a] = None - else: - # Hmmm. Guess we'll keep the shift - if not slevel and not rlevel: - _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) - n_srconflict +=1 - elif r < 0: - # Reduce/reduce conflict. In this case, we favor the rule - # that was defined first in the grammar file - oldp = Productions[-r] - pp = Productions[p.number] - if oldp.line > pp.line: - action[st,a] = -p.number - actionp[st,a] = p - # print "Reduce/reduce conflict in state %d" % st - n_rrconflict += 1 - _vfc.write("reduce/reduce conflict in state %d resolved using rule %d.\n" % (st, actionp[st,a].number)) - _vf.write(" ! 
reduce/reduce conflict for %s resolved using rule %d.\n" % (a,actionp[st,a].number)) - else: - print "Unknown conflict in state %d" % st - else: - action[st,a] = -p.number - actionp[st,a] = p - else: - i = p.lr_index - a = p.prod[i+1] # Get symbol right after the "." - if Terminals.has_key(a): - g = goto_cache[(idI,a)] - j = cid_hash.get(id(g),-1) - if j >= 0: - # We are in a shift state - _k = (a,j) - if not acthash.has_key(_k): - actlist.append((a,p,"shift and go to state %d" % j)) - acthash[_k] = 1 - r = action.get((st,a),None) - if r is not None: - # Whoa have a shift/reduce or shift/shift conflict - if r > 0: - if r != j: - sys.stderr.write("Shift/shift conflict in state %d\n" % st) - elif r < 0: - # Do a precedence check. - # - if precedence of reduce rule is higher, we reduce. - # - if precedence of reduce is same and left assoc, we reduce. - # - otherwise we shift - rprec,rlevel = Productions[actionp[st,a].number].prec - sprec,slevel = Precedence.get(a,('right',0)) - if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')): - # We decide to shift here... highest precedence to shift - action[st,a] = j - actionp[st,a] = p - if not slevel and not rlevel: - n_srconflict += 1 - _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) - elif (slevel == rlevel) and (rprec == 'nonassoc'): - action[st,a] = None - else: - # Hmmm. Guess we'll keep the reduce - if not slevel and not rlevel: - n_srconflict +=1 - _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) - - else: - sys.stderr.write("Unknown conflict in state %d\n" % st) - else: - action[st,a] = j - actionp[st,a] = p - else: - nonterminal = a - term_list = TReductions[nonterminal] - # DB: This loop gets executed a lot. Try to optimize - for a in term_list: - g = goto_cache[(idI,a)] - j = cid_hash[id(g)] - if j >= 0: - # We are in a shift state - # Don't put repeated shift actions on action list (performance hack) - _k = (a,j) - if not acthash.has_key(_k): - actlist.append((a,p,"shift and go to state "+str(j))) - acthash[_k] = 1 - - r = action.get((st,a),None) - if r is not None: - # Whoa have a shift/reduce or shift/shift conflict - if r > 0: - if r != j: - sys.stderr.write("Shift/shift conflict in state %d\n" % st) - continue - elif r < 0: - # Do a precedence check. - # - if precedence of reduce rule is higher, we reduce. - # - if precedence of reduce is same and left assoc, we reduce. - # - otherwise we shift - rprec,rlevel = Productions[actionp[st,a].number].prec - sprec,slevel = Precedence.get(a,('right',0)) - if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')): - # We decide to shift here... highest precedence to shift - action[st,a] = j - actionp[st,a] = p - if not slevel and not rlevel: - n_srconflict += 1 - _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) - elif (slevel == rlevel) and (rprec == 'nonassoc'): - action[st,a] = None - else: - # Hmmm. Guess we'll keep the reduce - if not slevel and not rlevel: - n_srconflict +=1 - _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) - _vf.write(" ! 
shift/reduce conflict for %s resolved as reduce.\n" % a) - - else: - sys.stderr.write("Unknown conflict in state %d\n" % st) - else: - action[st,a] = j - actionp[st,a] = p - - except StandardError,e: - raise YaccError, "Hosed in lalr_parse_table", e - - # Print the actions associated with each terminal - if yaccdebug: - for a,p,m in actlist: - if action.has_key((st,a)): - if p is actionp[st,a]: - _vf.write(" %-15s %s\n" % (a,m)) - _vf.write("\n") - - for a,p,m in actlist: - if action.has_key((st,a)): - if p is not actionp[st,a]: - _vf.write(" ! %-15s [ %s ]\n" % (a,m)) - - # Construct the goto table for this state - nkeys = { } - for ii in I: - for s in ii.usyms: - if Nonterminals.has_key(s): - nkeys[s] = None - - # Construct the goto table for this state - for n in nkeys.keys(): - g = lr0_goto(I,n) - j = cid_hash.get(id(g),-1) - if j >= 0: - goto[st,n] = j - if yaccdebug: - _vf.write(" %-30s shift and go to state %d\n" % (n,j)) - - st += 1 - if yaccdebug: - if n_srconflict == 1: - sys.stderr.write("yacc: %d shift/reduce conflict\n" % n_srconflict) - if n_srconflict > 1: - sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict) - if n_rrconflict == 1: - sys.stderr.write("yacc: %d reduce/reduce conflict\n" % n_rrconflict) - if n_rrconflict > 1: - sys.stderr.write("yacc: %d reduce/reduce conflicts\n" % n_rrconflict) - - -# ----------------------------------------------------------------------------- -# ==== LR Utility functions ==== -# ----------------------------------------------------------------------------- - -# ----------------------------------------------------------------------------- -# _lr_write_tables() -# -# This function writes the LR parsing tables to a file -# ----------------------------------------------------------------------------- - -def lr_write_tables(modulename=tab_module): - filename = modulename + ".py" - try: - f = open(filename,"w") - - f.write(""" -# %s -# This file is automatically generated. Do not edit. 
- -_lr_method = %s - -_lr_signature = %s -""" % (filename, repr(_lr_method), repr(Signature.digest()))) - - # Change smaller to 0 to go back to original tables - smaller = 1 - - # Factor out names to try and make smaller - if smaller: - items = { } - - for k,v in _lr_action.items(): - i = items.get(k[1]) - if not i: - i = ([],[]) - items[k[1]] = i - i[0].append(k[0]) - i[1].append(v) - - f.write("\n_lr_action_items = {") - for k,v in items.items(): - f.write("%r:([" % k) - for i in v[0]: - f.write("%r," % i) - f.write("],[") - for i in v[1]: - f.write("%r," % i) - - f.write("]),") - f.write("}\n") - - f.write(""" -_lr_action = { } -for _k, _v in _lr_action_items.items(): - for _x,_y in zip(_v[0],_v[1]): - _lr_action[(_x,_k)] = _y -del _lr_action_items -""") - - else: - f.write("\n_lr_action = { "); - for k,v in _lr_action.items(): - f.write("(%r,%r):%r," % (k[0],k[1],v)) - f.write("}\n"); - - if smaller: - # Factor out names to try and make smaller - items = { } - - for k,v in _lr_goto.items(): - i = items.get(k[1]) - if not i: - i = ([],[]) - items[k[1]] = i - i[0].append(k[0]) - i[1].append(v) - - f.write("\n_lr_goto_items = {") - for k,v in items.items(): - f.write("%r:([" % k) - for i in v[0]: - f.write("%r," % i) - f.write("],[") - for i in v[1]: - f.write("%r," % i) - - f.write("]),") - f.write("}\n") - - f.write(""" -_lr_goto = { } -for _k, _v in _lr_goto_items.items(): - for _x,_y in zip(_v[0],_v[1]): - _lr_goto[(_x,_k)] = _y -del _lr_goto_items -""") - else: - f.write("\n_lr_goto = { "); - for k,v in _lr_goto.items(): - f.write("(%r,%r):%r," % (k[0],k[1],v)) - f.write("}\n"); - - # Write production table - f.write("_lr_productions = [\n") - for p in Productions: - if p: - if (p.func): - f.write(" (%r,%d,%r,%r,%d),\n" % (p.name, p.len, p.func.__name__,p.file,p.line)) - else: - f.write(" (%r,%d,None,None,None),\n" % (p.name, p.len)) - else: - f.write(" None,\n") - f.write("]\n") - f.close() - - except IOError,e: - print "Unable to create '%s'" % filename - print e - return - -def lr_read_tables(module=tab_module,optimize=0): - global _lr_action, _lr_goto, _lr_productions, _lr_method - try: - exec "import %s as parsetab" % module - - if (optimize) or (Signature.digest() == parsetab._lr_signature): - _lr_action = parsetab._lr_action - _lr_goto = parsetab._lr_goto - _lr_productions = parsetab._lr_productions - _lr_method = parsetab._lr_method - return 1 - else: - return 0 - - except (ImportError,AttributeError): - return 0 - -# ----------------------------------------------------------------------------- -# yacc(module) -# -# Build the parser module -# ----------------------------------------------------------------------------- - -def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, start=None, check_recursion=1, optimize=0,write_tables=1,debugfile=debug_file): - global yaccdebug - yaccdebug = debug - - initialize_vars() - files = { } - error = 0 - - # Add starting symbol to signature - if start: - Signature.update(start) - - # Add parsing method to signature - Signature.update(method) - - # If a "module" parameter was supplied, extract its dictionary. - # Note: a module may in fact be an instance as well. - - if module: - # User supplied a module object. - if isinstance(module, types.ModuleType): - ldict = module.__dict__ - elif isinstance(module, types.InstanceType): - _items = [(k,getattr(module,k)) for k in dir(module)] - ldict = { } - for i in _items: - ldict[i[0]] = i[1] - else: - raise ValueError,"Expected a module" - - else: - # No module given. 
-        # Throw an exception and unwind the traceback to get the globals
-
-        try:
-            raise RuntimeError
-        except RuntimeError:
-            e,b,t = sys.exc_info()
-            f = t.tb_frame
-            f = f.f_back # Walk out to our calling function
-            ldict = f.f_globals # Grab its globals dictionary
-
-    # If running in optimized mode. We're going to
-
-    if (optimize and lr_read_tables(tabmodule,1)):
-        # Read parse table
-        del Productions[:]
-        for p in _lr_productions:
-            if not p:
-                Productions.append(None)
-            else:
-                m = MiniProduction()
-                m.name = p[0]
-                m.len = p[1]
-                m.file = p[3]
-                m.line = p[4]
-                if p[2]:
-                    m.func = ldict[p[2]]
-                Productions.append(m)
-
-    else:
-        # Get the tokens map
-        if (module and isinstance(module,types.InstanceType)):
-            tokens = getattr(module,"tokens",None)
-        else:
-            tokens = ldict.get("tokens",None)
-
-        if not tokens:
-            raise YaccError,"module does not define a list 'tokens'"
-        if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)):
-            raise YaccError,"tokens must be a list or tuple."
-
-        # Check to see if a requires dictionary is defined.
-        requires = ldict.get("require",None)
-        if requires:
-            if not (isinstance(requires,types.DictType)):
-                raise YaccError,"require must be a dictionary."
-
-            for r,v in requires.items():
-                try:
-                    if not (isinstance(v,types.ListType)):
-                        raise TypeError
-                    v1 = [x.split(".") for x in v]
-                    Requires[r] = v1
-                except StandardError:
-                    print "Invalid specification for rule '%s' in require. Expected a list of strings" % r
-
-
-        # Build the dictionary of terminals. We a record a 0 in the
-        # dictionary to track whether or not a terminal is actually
-        # used in the grammar
-
-        if 'error' in tokens:
-            print "yacc: Illegal token 'error'. Is a reserved word."
-            raise YaccError,"Illegal token name"
-
-        for n in tokens:
-            if Terminals.has_key(n):
-                print "yacc: Warning. Token '%s' multiply defined." % n
-            Terminals[n] = [ ]
-
-        Terminals['error'] = [ ]
-
-        # Get the precedence map (if any)
-        prec = ldict.get("precedence",None)
-        if prec:
-            if not (isinstance(prec,types.ListType) or isinstance(prec,types.TupleType)):
-                raise YaccError,"precedence must be a list or tuple."
-            add_precedence(prec)
-            Signature.update(repr(prec))
-
-        for n in tokens:
-            if not Precedence.has_key(n):
-                Precedence[n] = ('right',0) # Default, right associative, 0 precedence
-
-        # Look for error handler
-        ef = ldict.get('p_error',None)
-        if ef:
-            if isinstance(ef,types.FunctionType):
-                ismethod = 0
-            elif isinstance(ef, types.MethodType):
-                ismethod = 1
-            else:
-                raise YaccError,"'p_error' defined, but is not a function or method."
-            eline = ef.func_code.co_firstlineno
-            efile = ef.func_code.co_filename
-            files[efile] = None
-
-            if (ef.func_code.co_argcount != 1+ismethod):
-                raise YaccError,"%s:%d: p_error() requires 1 argument." % (efile,eline)
-            global Errorfunc
-            Errorfunc = ef
-        else:
-            print "yacc: Warning. no p_error() function is defined."
-
-        # Get the list of built-in functions with p_ prefix
-        symbols = [ldict[f] for f in ldict.keys()
-                   if (type(ldict[f]) in (types.FunctionType, types.MethodType) and ldict[f].__name__[:2] == 'p_'
-                       and ldict[f].__name__ != 'p_error')]
-
-        # Check for non-empty symbols
-        if len(symbols) == 0:
-            raise YaccError,"no rules of the form p_rulename are defined."
-
-        # Sort the symbols by line number
-        symbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno))
-
-        # Add all of the symbols to the grammar
-        for f in symbols:
-            if (add_function(f)) < 0:
-                error += 1
-            else:
-                files[f.func_code.co_filename] = None
-
-        # Make a signature of the docstrings
-        for f in symbols:
-            if f.__doc__:
-                Signature.update(f.__doc__)
-
-        lr_init_vars()
-
-        if error:
-            raise YaccError,"Unable to construct parser."
-
-        if not lr_read_tables(tabmodule):
-
-            # Validate files
-            for filename in files.keys():
-                if not validate_file(filename):
-                    error = 1
-
-            # Validate dictionary
-            validate_dict(ldict)
-
-            if start and not Prodnames.has_key(start):
-                raise YaccError,"Bad starting symbol '%s'" % start
-
-            augment_grammar(start)
-            error = verify_productions(cycle_check=check_recursion)
-            otherfunc = [ldict[f] for f in ldict.keys()
-                         if (type(f) in (types.FunctionType,types.MethodType) and ldict[f].__name__[:2] != 'p_')]
-
-            if error:
-                raise YaccError,"Unable to construct parser."
-
-            build_lritems()
-            compute_first1()
-            compute_follow(start)
-
-            if method == 'SLR':
-                slr_parse_table()
-            elif method == 'LALR':
-                lalr_parse_table()
-            else:
-                raise YaccError, "Unknown parsing method '%s'" % method
-
-            if write_tables:
-                lr_write_tables(tabmodule)
-
-            if yaccdebug:
-                try:
-                    f = open(debugfile,"w")
-                    f.write(_vfc.getvalue())
-                    f.write("\n\n")
-                    f.write(_vf.getvalue())
-                    f.close()
-                except IOError,e:
-                    print "yacc: can't create '%s'" % debugfile,e
-
-    # Made it here. Create a parser object and set up its internal state.
-    # Set global parse() method to bound method of parser object.
-
-    p = Parser("xyzzy")
-    p.productions = Productions
-    p.errorfunc = Errorfunc
-    p.action = _lr_action
-    p.goto = _lr_goto
-    p.method = _lr_method
-    p.require = Requires
-
-    global parse
-    parse = p.parse
-
-    # Clean up all of the globals we created
-    if (not optimize):
-        yacc_cleanup()
-    return p
-
-# yacc_cleanup function. Delete all of the global variables
-# used during table construction
-
-def yacc_cleanup():
-    global _lr_action, _lr_goto, _lr_method, _lr_goto_cache
-    del _lr_action, _lr_goto, _lr_method, _lr_goto_cache
-
-    global Productions, Prodnames, Prodmap, Terminals
-    global Nonterminals, First, Follow, Precedence, LRitems
-    global Errorfunc, Signature, Requires
-    global Prodempty, TReductions, NTReductions, GotoSetNum, Canonical
-
-    del Productions, Prodnames, Prodmap, Terminals
-    del Nonterminals, First, Follow, Precedence, LRitems
-    del Errorfunc, Signature, Requires
-    del Prodempty, TReductions, NTReductions, GotoSetNum, Canonical
-
-    global _vf, _vfc
-    del _vf, _vfc
-
-
-# Stub that raises an error if parsing is attempted without first calling yacc()
-def parse(*args,**kwargs):
-    raise YaccError, "yacc: No parser built with yacc()"
-
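
Editorial note (not part of the patch above): the yacc() entry point removed here is the standard PLY builder used by the bundled lex.py/yacc.py pair. A grammar module supplies a tokens list, p_ rule functions whose docstrings carry the grammar, and an optional p_error handler; yacc() hashes these into Signature, builds the SLR or LALR tables, and lr_write_tables()/lr_read_tables() cache them in the tabmodule so a later run with an unchanged grammar skips table construction. The sketch below shows how such a driver module would typically look; the token and rule names are purely illustrative and do not correspond to any file in this tree.

    # Illustrative sketch only -- not part of this diff.  Standard PLY usage
    # against the lex.py/yacc.py modules shipped alongside it (Python 2 era).
    import lex
    import yacc

    tokens = ('NUMBER', 'PLUS')

    t_PLUS   = r'\+'
    t_ignore = ' \t'

    def t_NUMBER(t):
        r'\d+'
        t.value = int(t.value)
        return t

    def t_error(t):
        pass   # lexer error handling elided in this sketch

    def p_expr_plus(t):
        'expr : expr PLUS NUMBER'
        t[0] = t[1] + t[3]

    def p_expr_number(t):
        'expr : NUMBER'
        t[0] = t[1]

    def p_error(t):
        pass   # called with the offending token (or None at end of input)

    lex.lex()                            # build the lexer from the t_ rules above
    parser = yacc.yacc(method='SLR',     # lr_write_tables() emits the table module
                       tabmodule='parsetab',
                       debug=1)          # debug mode also writes the debugfile
    print parser.parse("1+2")            # -> 3

Because the grammar signature is stored in the generated table module, deleting that module (or changing any rule docstring) is enough to force yacc() to rebuild the tables on the next run.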