Source code for streamcorpus_pipeline.force_clean_html
'''Do our best to fill in body.clean_html.
.. This software is released under an MIT/X11 open source license.
Copyright 2012-2015 Diffeo, Inc.
.. autoclass:: force_clean_html
'''
from __future__ import absolute_import, division, print_function
import logging
from streamcorpus_pipeline._clean_html import make_clean_html
from streamcorpus_pipeline._clean_visible import clean_visible
from streamcorpus_pipeline._exceptions import InvalidStreamItem
from streamcorpus_pipeline.stages import Configured
logger = logging.getLogger(__name__)
[docs]class force_clean_html(Configured):
'''force :attr:`~StreamItem.body.clean_html` to be populated or
rejects the StreamItem.
'''
config_name = 'force_clean_html'
def __call__(self, stream_item, context):
if stream_item.body.clean_html is not None and \
len(stream_item.body.clean_html) > 0:
return stream_item
if stream_item.body.clean_visible is None:
logger.warning('stream item %s has neither clean_visible nor '
'clean_html', stream_item.stream_id)
raise InvalidStreamItem
# With only clean visible, the best we can do is wrap it in a <pre>
# tag and hope for the best.
clean_vis_as_html = '<pre>%s</pre>' % stream_item.body.clean_visible
stream_item.body.clean_html = make_clean_html(
clean_vis_as_html, stream_item=stream_item)
# Since `clean_visible` has several invariants coupled with
# `clean_html`, we need to regenerate it. It's a bit circuitous, but
# less likely to fail.
stream_item = clean_visible({})(stream_item, context)
# check again to make sure we got something
if stream_item.body.clean_html is not None and \
len(stream_item.body.clean_html) > 0:
return stream_item