# Source code for afew.DBACL

# coding=utf-8
from __future__ import print_function, absolute_import, unicode_literals

#
# Copyright (c) Justus Winter <4winter@informatik.uni-hamburg.de>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#

import os
import glob
import logging
import functools
import subprocess

class ClassificationError(Exception):
    """Base class for the classification errors defined in this module."""


class BackendError(ClassificationError):
    """A classification error attributed to the external backend program."""

# Default directory in which the classifiers below keep their category
# databases: <XDG data home>/afew/categories.
#
# The XDG Base Directory specification requires the ~/.local/share fallback
# when XDG_DATA_HOME is unset *or empty*, hence `or` instead of a `.get()`
# default (which would turn an empty value into a relative path).
default_db_path = os.path.join(
    os.environ.get('XDG_DATA_HOME') or os.path.expanduser('~/.local/share'),
    'afew', 'categories')

class Classifier(object):
    """Minimal classifier interface.

    The base class only records its configuration; ``learn`` and
    ``classify`` do nothing and return ``None``.
    """

    reference_category = 'reference_category'

    def __init__(self, categories, database_directory=default_db_path):
        """Store the classifier configuration.

        :param categories: iterable of category names; kept as a ``set``.
        :param database_directory: directory associated with the categories.
        """
        self.database_directory = database_directory
        self.categories = set(categories)

    def learn(self, category, texts):
        """No-op in the base class; returns ``None``."""
        return None

    def classify(self, text):
        """No-op in the base class; returns ``None``."""
        return None
class DBACL(Classifier):
    """Classifier that delegates to the external ``dbacl`` program.

    Every category maps to one entry inside ``database_directory`` (see
    :meth:`get_category_path`); the set of known categories is read from
    that directory when the object is created.
    """

    # Environment passed to dbacl: a copy of ours without the locale
    # variables (LC_*, LANG, LANGUAGE).
    # NOTE(review): presumably so dbacl's output does not depend on the
    # user's locale -- confirm.
    sane_environ = {
        key: value
        for key, value in os.environ.items()
        if not (key.startswith('LC_') or key in ('LANG', 'LANGUAGE'))
    }

    def __init__(self, database_directory=default_db_path):
        """Discover the existing categories in *database_directory*.

        :param database_directory: directory holding the category data.
            A missing or unreadable directory yields no categories.
        """
        # Same result as the private (and, since Python 3.13, deprecated)
        # ``glob.glob1(database_directory, '*')``: all non-hidden entries,
        # or nothing when the directory cannot be listed.
        try:
            entries = os.listdir(database_directory or os.curdir)
        except OSError:
            entries = []
        categories = [name for name in entries if not name.startswith('.')]
        super(DBACL, self).__init__(categories, database_directory)

    def _call_dbacl(self, args, **kwargs):
        """Start ``dbacl -T email`` with *args* appended.

        :param args: list of additional command line arguments.
        :param kwargs: extra keyword arguments forwarded to
            ``subprocess.Popen``.
        :returns: the ``subprocess.Popen`` object; stdin, stdout and stderr
            are all pipes.
        """
        command_line = ['dbacl', '-T', 'email'] + args
        logging.debug('executing %r', command_line)
        return subprocess.Popen(
            command_line,
            shell=False,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env=self.sane_environ,
            **kwargs
        )

    def get_category_path(self, category):
        """Return the path used for *category* inside the database directory.

        Slashes in the category name are replaced by underscores so the
        name stays a single path component.
        """
        return os.path.join(self.database_directory,
                            category.replace('/', '_'))

    def learn(self, category, texts):
        """Feed *texts* to ``dbacl -l`` for *category*.

        :param category: name of the category to learn.
        :param texts: iterable of text strings; each one is written to
            dbacl's stdin UTF-8 encoded and followed by a newline.
        :raises BackendError: if dbacl exits with a non-zero status.
        """
        process = self._call_dbacl(['-l', self.get_category_path(category)])
        for text in texts:
            process.stdin.write((text + '\n').encode('utf-8'))

        # communicate() flushes and closes stdin, drains stdout/stderr and
        # waits for the process.  A plain wait() with piped output can
        # deadlock once dbacl fills a pipe buffer, and leaves the pipes open.
        _, stderr = process.communicate()
        if process.returncode != 0:
            raise BackendError('dbacl learning failed:\n%s'
                               % stderr.decode('utf-8', 'replace'))

    def classify(self, text):
        """Score *text* against every known category with ``dbacl -n``.

        :param text: the text string to classify.
        :returns: list of ``(category, score)`` tuples sorted by ascending
            score; ``category`` is the name as printed by dbacl.
            NOTE(review): with ``-n`` a lower score presumably means a
            better match -- confirm against dbacl(1).
        :raises ClassificationError: if no categories are defined.
        :raises BackendError: if dbacl writes anything to stderr or its
            output cannot be parsed as ``name score`` pairs.
        """
        if not self.categories:
            raise ClassificationError('No categories defined')

        arguments = []
        for category in self.categories:
            arguments.extend(['-c', self.get_category_path(category)])
        arguments.append('-n')

        process = self._call_dbacl(arguments)
        stdout, stderr = process.communicate(text.encode('utf-8'))

        # Any stderr output is treated as failure; the exit status is not
        # consulted here.
        if stderr:
            raise BackendError('dbacl classification failed:\n%s'
                               % stderr.decode('utf-8', 'replace'))

        # Output is a flat, whitespace separated "name score name score ..."
        tokens = stdout.split()
        if len(tokens) % 2:
            raise BackendError('dbacl classification failed:\n'
                               'unexpected output %r' % (stdout,))
        try:
            scores = [
                (name.decode('utf-8', 'replace'), float(value))
                for name, value in zip(tokens[::2], tokens[1::2])
            ]
        except ValueError:
            raise BackendError('dbacl classification failed:\n'
                               'unexpected output %r' % (stdout,))

        scores.sort(key=lambda category_score: category_score[1])
        return scores