bfe_webjournal_articles_overview.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Mon, Aug 5, 05:13

bfe_webjournal_articles_overview.py
View Options

	# -*- coding: utf-8 -*-
	# $Id: bfe_webjournal_MainArticleOverview.py,v 1.28 2009/02/12 10:00:57 jerome Exp $
	#
	# This file is part of Invenio.
	# Copyright (C) 2009, 2010, 2011 CERN.
	#
	# Invenio is free software; you can redistribute it and/or
	# modify it under the terms of the GNU General Public License as
	# published by the Free Software Foundation; either version 2 of the
	# License, or (at your option) any later version.
	#
	# Invenio is distributed in the hope that it will be useful, but
	# WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with Invenio; if not, write to the Free Software Foundation, Inc.,
	# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
	"""
	WebJournal Element - Creates an overview of all the articles of a
	certain category in one specific issue.
	"""
	import re
	import os
	import urllib, urllib2
	try:
	from PIL import Image
	PIL_imported = True
	except ImportError:
	PIL_imported = False
	from invenio.bibformat_engine import BibFormatObject
	from invenio.htmlutils import HTMLWasher, remove_html_markup
	from invenio.messages import gettext_set_language
	from invenio.config import \
	CFG_ACCESS_CONTROL_LEVEL_SITE, \
	CFG_TMPDIR, \
	CFG_SITE_LANG
	from invenio.webjournal_utils import \
	cache_index_page, \
	get_index_page_from_cache, \
	parse_url_string, \
	make_journal_url, \
	get_journal_articles, \
	issue_is_later_than, \
	get_current_issue
	from invenio.webjournal_utils import \
	img_pattern, \
	header_pattern, \
	header_pattern2, \
	para_pattern
	from invenio.urlutils import create_html_link
	from invenio.bibdocfile import decompose_file

	def format_element(bfo, number_of_featured_articles="1",
	                   number_of_articles_with_image="3", new_articles_first='yes',
	                   image_px_width="300", small_image_px_width="200",
	                   subject_to_css_class_kb="WebJournalSubject2CSSClass",
	                   link_image_to_article='yes', image_alignment='left'):
	    """
	    Creates an overview of all the articles of a certain category in one
	    specific issue.

	    Note the following:
	    <ul>
	    <li>The journal, the issue and the category are read from the URL
	    of the current request</li>

	    <li>Articles with a negative order number are considered as
	    <b>new</b>, unless all the articles of the issue are new. New
	    articles get the "new" CSS class, are listed without teaser text
	    and only get an image when one of their 595__a notes is
	    "display_image_on_index"</li>

	    <li>This element produces a table-based layout, in order to have a
	    more or less readable HTML alert when sent to some email clients
	    (Outlook 2007)</li>

	    <li>When producing the HTML output of images, this element tries to
	    insert the width and height attributes to the img tag: this is
	    necessary in order to produce nice HTML alerts. This dimension
	    therefore overrides any dimension defined in the CSS. The Python
	    Image Library (PIL) should be installed for this element to
	    recognize the size of images.</li>

	    <li>When "verbose" is 0, the output is read from the index page
	    cache if the issue is not later than the current issue, and the
	    freshly built output is stored in that cache unless
	    CFG_ACCESS_CONTROL_LEVEL_SITE is 2</li>
	    </ul>

	    @param number_of_featured_articles: the max number of records with emphasized title
	    @param number_of_articles_with_image: the max number of records for which their image is displayed
	    @param new_articles_first: if 'yes', display new articles before other articles
	    @param image_px_width: (integer) width of first image featured on this page
	    @param small_image_px_width: (integer) width of small images featured on this page
	    @param subject_to_css_class_kb: knowledge base that maps 595__a to a CSS class
	    @param link_image_to_article: if 'yes', link image (if any) to article
	    @param image_alignment: 'left', 'center' or 'right'. To help rendering in Outlook.
	    """
	    # NOTE(review): the previous docstring stated that only the latest
	    # issue and only new articles are considered. The code below uses
	    # the issue found in the URL and lists all the articles of the
	    # category -- confirm which behaviour is the intended one.
	    args = parse_url_string(bfo.user_info['uri'])
	    journal_name = args["journal_name"]
	    this_issue_number = args["issue"]
	    category_name = args["category"]
	    verbose = args["verbose"]
	    ln = bfo.lang
	    # Translation function: currently not used by this element
	    _ = gettext_set_language(ln)

	    # Widths that are not made of digits disable the computation of
	    # the image dimensions
	    image_px_width = _digits_to_int(image_px_width, None)
	    small_image_px_width = _digits_to_int(small_image_px_width, None)

	    # We want to put emphasis on the n first articles (which are not
	    # new)
	    number_of_featured_articles = \
	        _digits_to_int(number_of_featured_articles, 0)

	    # Only n first articles will display images
	    number_of_articles_with_image = \
	        _digits_to_int(number_of_articles_with_image, 0)

	    # Help image alignment without CSS, to have better rendering in Outlook
	    img_align = ''
	    if image_alignment:
	        img_align = 'align="%s"' % image_alignment

	    # Try to get the page from cache. Only if issue is older or equal
	    # to latest release.
	    latest_released_issue = get_current_issue(ln, journal_name)
	    if verbose == 0 and not issue_is_later_than(this_issue_number,
	                                                latest_released_issue):
	        cached_html = get_index_page_from_cache(journal_name, category_name,
	                                                this_issue_number, ln)
	        if cached_html:
	            return cached_html

	    # Get the id list
	    ordered_articles = get_journal_articles(journal_name,
	                                            this_issue_number,
	                                            category_name,
	                                            newest_first=new_articles_first.lower() == 'yes')
	    order_numbers = sorted(ordered_articles.keys())
	    # If there are only new articles (all the order numbers are
	    # negative), don't bother marking them as new
	    new_articles_only = bool(order_numbers) and order_numbers[-1] < 0

	    # Two different outputs depending on if we have text to display
	    # or not.
	    # NOTE(review): the leading whitespace inside these literals is
	    # kept exactly as found in the file.
	    article_with_text_template = '''
	<tr><td class="article">
	<h%(header_tag_size)s class="%(css_classes)s articleTitle" style="clear:both;">
	<a title="link to the article" href="%(article_link)s">%(title)s</a>
	</h%(header_tag_size)s>
	<div class="articleBody">
	%(img)s
	%(text)s
	%(more_link)s
	</div>
	</td></tr>
	'''
	    article_without_text_template = '''
	<tr><td class="article">
	<h%(header_tag_size)s class="%(css_classes)s articleTitle" style="clear:both;">
	<a title="link to the article" href="%(article_link)s">%(title)s</a>
	%(more_link)s
	</h%(header_tag_size)s>
	%(img)s
	</td></tr>
	'''

	    rows = []
	    # The first displayed image uses the big format
	    img_css_class = "featuredImageScale"

	    for order_number in order_numbers:
	        # A record is considered as new if its position is
	        # negative and there are some non-new articles
	        article_is_new = (order_number < 0 and not new_articles_only)

	        for article_id in ordered_articles[order_number]:
	            temp_rec = BibFormatObject(article_id)
	            if ln == "fr":
	                title = temp_rec.field('246_1a')
	                if title == '':
	                    title = temp_rec.field('245__a')
	            else:
	                title = temp_rec.field('245__a')
	                if title == '':
	                    title = temp_rec.field('246_1a')

	            # Get CSS class (if relevant)
	            notes = temp_rec.fields('595__a')
	            css_classes = [temp_rec.kb(subject_to_css_class_kb, note, None)
	                           for note in notes]
	            css_classes = [css_class for css_class in css_classes
	                           if css_class is not None]
	            if article_is_new:
	                css_classes.append('new')

	            # Maybe we want to force image to appear?
	            display_image_on_index = 'display_image_on_index' in notes

	            # Build generic link to this article
	            article_link = make_journal_url(bfo.user_info['uri'],
	                                            {'recid': str(article_id),
	                                             'ln': bfo.lang})

	            # Build the "more" link
	            # NOTE(review): " >> " is possibly an unescaped "&gt;&gt;"
	            # -- confirm against the reference version of this file.
	            more_link = '''<a class="readMore" title="link to the article" href="%s"> >> </a>
	''' % (article_link)

	            # If we should display an image along with the text,
	            # prepare it here
	            img = ''
	            if (number_of_articles_with_image > 0 and not article_is_new) \
	                   or display_image_on_index:
	                img_url = _get_feature_image(temp_rec, ln)
	                if img_url:
	                    # Only the first image is displayed with the big
	                    # width, next ones use the small width
	                    if img_css_class == "featuredImageScale":
	                        target_width = image_px_width
	                    else:
	                        target_width = small_image_px_width
	                    # Note that we cannot reuse the nice phl, ph and
	                    # phr classes to put a frame around the image:
	                    # this is not supported in Outlook 2007 when HTML
	                    # alert is sent.
	                    width_and_height = \
	                        _get_image_size_attributes(img_url, target_width)
	                    img = '<img alt="" class="%s" src="%s" %s %s/>' % \
	                          (img_css_class, img_url, img_align,
	                           width_and_height)
	                    number_of_articles_with_image -= 1

	                    # Next images will be displayed smaller
	                    img_css_class = "featuredImageScaleSmall"

	            # Determine size of the title
	            header_tag_size = '3'
	            if number_of_featured_articles > 0 and not article_is_new:
	                # n first articles are especially featured
	                header_tag_size = '2'
	                number_of_featured_articles -= 1

	            text = ''
	            if not article_is_new:
	                text = _get_feature_text(temp_rec, ln)

	            # Link image to article if wanted. Nothing to link when
	            # there is no image: avoid producing an empty <a> tag
	            if img and link_image_to_article.lower() == 'yes':
	                img = create_html_link(urlbase=article_link,
	                                       link_label=img,
	                                       urlargd={})

	            # Finally create the output
	            values = {'article_link': article_link,
	                      'title': title,
	                      'img': img,
	                      'text': text,
	                      'more_link': more_link,
	                      'css_classes': ' '.join(css_classes),
	                      'header_tag_size': header_tag_size}
	            if text != '':
	                rows.append(article_with_text_template % values)
	            else:
	                rows.append(article_without_text_template % values)

	    out = '<table border="0" cellpadding="0" cellspacing="0">' + \
	          ''.join(rows) + \
	          '</table>'
	    if verbose == 0 and not CFG_ACCESS_CONTROL_LEVEL_SITE == 2:
	        cache_index_page(out, journal_name, category_name,
	                         this_issue_number, ln)

	    return out

	def _digits_to_int(value, default):
	    """
	    Returns the integer value of the string 'value' if it is only made
	    of digits, else returns 'default'.
	    """
	    if value.isdigit():
	        return int(value)
	    return default

	def _get_image_size_attributes(img_url, target_width):
	    """
	    Returns the 'width="..." height="..."' attributes to be used to
	    display the image at 'img_url' with a width of 'target_width'
	    pixels, keeping the proportions of the image.

	    The image is read with PIL from a local copy kept in CFG_TMPDIR,
	    downloaded first when the copy does not exist yet.

	    Returns an empty string when the dimensions cannot be computed:
	    PIL not installed, no usable target width, image not readable, or
	    null dimension.
	    """
	    if not PIL_imported or not target_width:
	        return ''
	    try:
	        local_img = os.path.join(CFG_TMPDIR,
	                                 'webjournal_' + \
	                                 ''.join([char for char in img_url \
	                                          if char.isalnum()]))
	        if len(local_img) > 255:
	            # Shorten to exactly 255 chars, keeping both ends of the
	            # name: 100 + 1 + 154
	            local_img = local_img[:100] + '_' + local_img[-154:]
	        if not os.path.exists(local_img):
	            # Too bad, must download entire image for PIL
	            content_type = get_content_type(img_url)
	            if not content_type or 'image' not in content_type:
	                raise IOError('Not an image')
	            (local_img, dummy_headers) = urllib.urlretrieve(img_url,
	                                                            local_img)
	        img_file = Image.open(local_img) # IOError if not readable image
	    except IOError:
	        # Best effort: the image is then displayed without explicit
	        # dimensions
	        return ''

	    orig_img_width = img_file.size[0]
	    orig_img_height = img_file.size[1]
	    if not orig_img_width:
	        # Cannot compute a ratio from a null width
	        return ''
	    # Scale according to user-defined width
	    ratio = float(orig_img_width) / target_width
	    img_height = int(orig_img_height / ratio)
	    if not img_height:
	        return ''
	    return 'width="%i" height="%i"' % (target_width, img_height)

	def escape_values(bfo):
	    """
	    Tells BibFormat whether the output of this element has to be
	    escaped. Always answers 0, whatever the given 'bfo' is.
	    """
	    return 0

	def _get_feature_image(record, ln=CFG_SITE_LANG):
	    """
	    Returns the source of an image that can illustrate the article on
	    the overview page.

	    The image is first searched (with img_pattern) in the body of the
	    article: field 590__b for French, 520__b otherwise, each one being
	    replaced by the other when empty. When no image is found there,
	    the first value of 8564_q accepted by the extension filter is
	    returned.

	    @param record: the record of the article
	    @param ln: the language in which the article is displayed
	    @return: the source of the image, or a false value if none is found
	    """
	    if ln == "fr":
	        body_tags = ('590__b', '520__b')
	    else:
	        body_tags = ('520__b', '590__b')
	    body = ''.join(record.fields(body_tags[0]))
	    if not body:
	        body = ''.join(record.fields(body_tags[1]))

	    src = ''
	    img_match = re.search(img_pattern, body)
	    if img_match:
	        src = img_match.group("image")
	    if src:
	        return src

	    # Nothing in the body: look for an attached image.
	    # NOTE(review): this assumes that the third item returned by
	    # decompose_file() is the extension WITHOUT leading dot -- confirm
	    # against bibdocfile, or this filter never matches.
	    accepted_extensions = ['jpg', 'jpeg', 'png', 'gif']
	    attached_images = [url for url in record.fields('8564_q')
	                       if decompose_file(url)[2] in accepted_extensions]
	    if attached_images:
	        return attached_images[0]
	    return src

	def _get_first_sentence_or_part(header_text):
	"""
	Tries to cut the text at the end of the first sentence or an empty space
	between char 200 and 300. Else return 250 first chars.
	"""
	header_text = header_text.lstrip()
	first_sentence = header_text[100:].find(".")
	if first_sentence == -1:
	# try question mark
	first_sentence = header_text[100:].find("?")
	if first_sentence == -1:
	# try exclamation mark
	first_sentence = header_text[100:].find("!")
	if first_sentence != -1 and first_sentence < 250:
	return "%s." % header_text[:(100+first_sentence)]
	else:
	an_empty_space = header_text[200:].find(" ")
	if an_empty_space != -1 and an_empty_space < 300:
	return "%s..." % header_text[:(200+an_empty_space)]
	else:
	return "%s..." % header_text[:250]

	def _get_feature_text(record, language):
	    """
	    Looks for a text (header) that can be featured on the article overview
	    page.

	    The following sources are tried, in this order:
	      1. the header field of the record (590__a for French, 520__a
	         otherwise, each one being replaced by the other when empty),
	         washed with no HTML tag allowed
	      2. the header found in the body of the article (first value of
	         590__b or 520__b) with header_pattern or header_pattern2,
	         washed with only the "a" tag allowed
	      3. the first paragraph of the body (para_pattern) without its
	         markup, shortened when longer than 250 chars
	      4. the beginning of the body, without its markup

	    @param record: the record of the article
	    @param language: the language of the text: "fr" gives the priority
	                     to the 590 fields, else 520 fields come first
	    @return: the text to feature. Empty string if the record has
	             neither header nor body.
	    """
	    # Values standing for an empty header / an empty body
	    empty_headers = ['', '<br/>', '<!--HTML--><br />', '<!--HTML-->']
	    empty_bodies = ['', '<br />', '<!--HTML--><br />', '<!--HTML-->']

	    washer = HTMLWasher()

	    # Check if there is a header
	    if language == "fr":
	        header = record.field('590__a')
	        if header.strip() in empty_headers:
	            header = record.field('520__a')
	    else:
	        header = record.field('520__a')
	        if header.strip() in empty_headers:
	            header = record.field('590__a')
	    header = washer.wash(html_buffer=header,
	                         allowed_tag_whitelist=[],
	                         allowed_attribute_whitelist=[])
	    if header != "":
	        return header

	    # No header: extract the text from the body of the article
	    if language == "fr":
	        article = record.fields('590__b')
	        if not article or \
	               (len(article) == 1 and \
	                article[0].strip() in empty_bodies):
	            article = record.fields('520__b')
	    else:
	        article = record.fields('520__b')
	        if not article or \
	               (len(article) == 1 and \
	                article[0].strip() in empty_bodies):
	            article = record.fields('590__b')
	    if not article:
	        # No body at all
	        return ''
	    article = article[0]

	    header_text = ""
	    match_obj = re.search(header_pattern, article)
	    if not match_obj:
	        match_obj = re.search(header_pattern2, article)
	    try:
	        # AttributeError if none of the patterns matched
	        header_text = match_obj.group("header")
	        header_text = washer.wash(html_buffer=header_text,
	                                  allowed_tag_whitelist=['a'],
	                                  allowed_attribute_whitelist=['href',
	                                                               'target',
	                                                               'class'])
	        if header_text == "":
	            raise Exception
	    except Exception:
	        # No usable header in the body: fall back on the paragraphs,
	        # once the unusable header (if any) and the empty paragraphs
	        # are removed
	        if header_text:
	            article = article.replace(header_text, '')
	        article = article.replace('<p/>', '')
	        # NOTE(review): the literal below is possibly '<p>&nbsp;</p>'
	        # in the reference version of this file -- confirm.
	        article = article.replace('<p> </p>', '')
	        match_obj = re.search(para_pattern, article)
	        try:
	            # get the first paragraph (AttributeError if there is none)
	            header_text = match_obj.group("paragraph")
	            try:
	                header_text = washer.wash(html_buffer=header_text,
	                                          allowed_tag_whitelist=[],
	                                          allowed_attribute_whitelist=[])
	            except Exception:
	                # was not able to parse correctly the HTML. Use
	                # this safer function, but producing less good
	                # results
	                header_text = remove_html_markup(header_text)

	            if header_text.strip() == "":
	                raise Exception
	            if len(header_text) > 250:
	                header_text = _get_first_sentence_or_part(header_text)
	        except Exception:
	            # in a last instance get the first sentence
	            try:
	                article = washer.wash(article,
	                                      allowed_tag_whitelist=[],
	                                      allowed_attribute_whitelist=[])
	            except Exception:
	                # was not able to parse correctly the HTML. Use
	                # this safer function, but producing less good
	                # results
	                article = remove_html_markup(article)

	            header_text = _get_first_sentence_or_part(article)

	    return header_text

	def get_content_type(url):
	    """
	    Returns the content-type of the given URL.
	    Returns an empty string if the content-type could not be resolved:
	    the URL cannot be opened, or the response has no content-type
	    header.

	    @param url: URL for which we would like to get the content-type
	    @type url: string
	    @return: the content-type of the given URL
	    @rtype: string
	    """
	    try:
	        response = urllib2.urlopen(urllib2.Request(url))
	        try:
	            content_type = response.info().getheader('content-type')
	        finally:
	            # Do not leave the connection open once the headers are read
	            response.close()
	    except Exception:
	        # Best effort: any failure means "unknown content-type"
	        return ''
	    # The header might be missing from the response
	    return content_type or ''

bfe_webjournal_articles_overview.pyNo OneTemporaryActions

File Metadata

bfe_webjournal_articles_overview.pyView Options

Event Timeline

bfe_webjournal_articles_overview.py
No OneTemporary
Actions

bfe_webjournal_articles_overview.py
View Options