Source code for afew.utils

# coding=utf-8
from __future__ import print_function, absolute_import, unicode_literals

#
# Copyright (c) Justus Winter <4winter@informatik.uni-hamburg.de>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#

import codecs
import re
import sys
import email
from datetime import datetime

# A line that starts with one of these two-character markers is treated as
# the first line of a signature block.
signature_line_re = re.compile(r'^((--)|(__)|(==)|(\*\*)|(##))')


def strip_signatures(lines, max_signature_size = 10):
    r'''
    Strip signatures from a mail. Used to filter mails before
    classifying mails.

    The lines are scanned from the end of the mail towards its start.
    Every line starting with a signature marker (``--``, ``__``, ``==``,
    ``**`` or ``##``) extends the stripped region up to and including that
    line. Scanning stops once `max_signature_size` lines have been seen
    without hitting another marker. If no marker is found, the mail is
    returned unchanged.

    :param lines: a mail split at newlines
    :type  lines: :class:`list` of :class:`str`
    :param max_signature_size: consider message parts up to this size as
                               signatures
    :type  max_signature_size: int
    :returns: a new list holding the mail with signatures stripped off
    :rtype: :class:`list` of :class:`str`

    >>> strip_signatures([
    ...     'Huhu',
    ...     '--',
    ...     'Ikke',
    ... ])
    ['Huhu']
    >>> strip_signatures([
    ...     'Huhu',
    ...     '--',
    ...     'Ikke',
    ...     '**',
    ...     "Sponsored by PowerDoh\'",
    ...     "Sponsored by PowerDoh\'",
    ...     "Sponsored by PowerDoh\'",
    ...     "Sponsored by PowerDoh\'",
    ...     "Sponsored by PowerDoh\'",
    ... ], 5)
    ['Huhu']
    >>> strip_signatures([
    ...     'Huhu',
    ...     'no signature here',
    ... ])
    ['Huhu', 'no signature here']
    '''
    # Number of trailing lines (counted from the end) that belong to
    # signatures; stays 0 when no signature marker is found.
    siglines = 0
    # Lines scanned since the last marker (or since the end of the mail).
    sigline_count = 0

    for n, line in enumerate(reversed(lines)):
        if signature_line_re.match(line):
            # set the last line to include
            siglines = n + 1

            # reset the line count
            sigline_count = 0

        if sigline_count >= max_signature_size:
            break

        sigline_count += 1

    # Slice with an explicit end index: ``lines[:-siglines]`` would yield an
    # empty list for ``siglines == 0`` and thereby drop the whole mail.
    return lines[:len(lines) - siglines]
def extract_mail_body(message):
    r'''
    Extract the plain text body of the message with signatures
    stripped off.

    The file named by ``message.get_filename()`` is parsed as an email.
    Every ``text/plain`` part is decoded, passed through
    :func:`strip_signatures`, and the resulting parts are joined with
    newlines.

    :param message: the message to extract the body from
    :type  message: :class:`notmuch.Message`
    :returns: the extracted text body
    :rtype: :class:`str`
    '''
    filename = message.get_filename()

    if hasattr(email, 'message_from_binary_file'):
        # The parser consumes the whole file, so the handle can be closed
        # as soon as parsing has finished.
        with open(filename, 'rb') as fp:
            mail = email.message_from_binary_file(fp)
    else:
        if (3, 1) <= sys.version_info < (3, 2):
            fp = codecs.open(filename, 'r', 'utf-8', errors='replace')
        else:
            fp = open(filename)
        try:
            mail = email.message_from_file(fp)
        finally:
            fp.close()

    fallback_encoding = sys.getdefaultencoding()
    content = []
    for part in mail.walk():
        if part.get_content_type() != 'text/plain':
            continue

        raw_payload = part.get_payload(decode=True)
        encoding = part.get_content_charset()
        try:
            text = raw_payload.decode(encoding or fallback_encoding, 'replace')
        except LookupError:
            # The declared charset is unknown to Python; fall back to the
            # interpreter's default encoding.
            text = raw_payload.decode(fallback_encoding, 'replace')

        lines = strip_signatures(text.split('\n'))
        content.append('\n'.join(lines))

    return '\n'.join(content)
def filter_compat(*args):
    r'''
    Call the :func:`filter` builtin and return its result as a list.

    Python 3 changed what the filter builtin returns. Wrapping it here
    gives callers a :class:`list` on both python versions, as a temporary
    workaround for supporting them in one code base.

    :param args: positional arguments forwarded unchanged to :func:`filter`
    :returns: the items produced by ``filter(*args)``
    :rtype: :class:`list`
    '''
    filtered = filter(*args)
    return list(filtered)
def get_message_summary(message):
    r'''
    Build a one-line summary of a message.

    :param message: object providing ``get_date()`` and ``get_header()``
                    (presumably a :class:`notmuch.Message`)
    :returns: ``[<date>] <sender> | <subject>``, where the date is
              ``message.get_date()`` converted with
              :meth:`datetime.fromtimestamp` and the sender comes from
              :func:`get_sender`
    :rtype: :class:`str`
    '''
    when = datetime.fromtimestamp(float(message.get_date()))
    sender = get_sender(message)
    subject = message.get_header('Subject')
    return '[{date}] {sender} | {subject}'.format(date=when,
                                                  sender=sender,
                                                  subject=subject)


def get_sender(message):
    r'''
    Return the sender of a message, preferring the display name.

    If the ``From`` header looks like ``Name <user@host.tld>``, only the
    name part is returned; otherwise the header value is returned as is.

    :param message: object providing ``get_header()``
                    (presumably a :class:`notmuch.Message`)
    :returns: the sender's name, or the raw ``From`` header
    :rtype: :class:`str`
    '''
    sender = message.get_header('From')
    # Raw string: ``\.`` in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    name_match = re.search(r'(.+) <.+@.+\..+>', sender)
    if name_match:
        sender = name_match.group(1)
    return sender