# text.py - text analysis helpers: name finding and keyword counting.
import spacy


class NameFinder:
    """
    A class for text analysis, for name finding.

    The object returned by ``spacy.load`` is stored on the instance once and
    reused by every lookup method.
    """

    def __init__(self):
        self.nlp = spacy.load('xx_ent_wiki_sm')  # loads this model for identifying names

    def _entities_with_label(self, text, label):
        """
        Runs ``self.nlp`` on a text and collects the entities carrying one label.
        :param text: The text to analyse.
        :param label: The label to keep (compared against ``entity.label_``).
        :return: list of the matching entity texts, in the order of ``doc.ents``.
        """
        doc = self.nlp(text)
        return [entity.text for entity in doc.ents if entity.label_ == label]

    def get_persons(self, text, min_words=2, max_words=4):
        """
        Creates a list of person names from a text.
        :param text: The text in which this functions searches for names.
        :param min_words: Minimum number of words a name must have to be kept
                          (default 2, i.e. no single names).
        :param max_words: Maximum number of words a name may have to be kept
                          (default 4, i.e. no extremely large names).
        :return: list of found person names.
        """
        # TODO improve this process
        # make sure we get no single names and no extremely large names
        # str.split() without an argument splits on any run of whitespace, so
        # double spaces, tabs, newlines or a trailing space inside an entity
        # do not distort the word count (split(' ') would count empty strings).
        return [
            person
            for person in self._entities_with_label(text, 'PER')
            if min_words <= len(person.split()) <= max_words
        ]

    def get_organisations(self, text):
        """
        Creates a list of organisation names from a text.
        :param text: The text in which this functions searches for names.
        :return: list of found organisation names.
        """
        return self._entities_with_label(text, 'ORG')


class KeywordFinder:
    """
    A class for text analysis, for keyword finding.

    Keywords are kept in three class-level lists (positive, neutral and
    negative); ``get_keyword_count`` reports the occurrences of each keyword
    per list.
    """
    positive_keywords = ['Speaker',
                         'Invite']

    neutral_keywords = ['Conference',
                        'Paper',
                        'University']

    negative_keywords = ['Organize',
                         'Committee']

    @staticmethod
    def _count_occurrences(lowered_text, keywords):
        """
        Counts how often each keyword occurs in an already lower-cased text.
        :param lowered_text: The text to search, already converted to lower case.
        :param keywords: The keywords to look for; each is lower-cased before counting.
        :return: dict mapping each keyword (original spelling) to its number of
                 non-overlapping substring occurrences.
        """
        return {keyword: lowered_text.count(keyword.lower()) for keyword in keywords}

    @staticmethod
    def get_keyword_count(text: str):
        """
        Counts and returns the number of found keywords.

        Matching is case-insensitive and substring based (``str.count``), so a
        keyword is also counted inside longer words, e.g. 'Invite' in 'invited'.
        :param text: The text in which this functions searches for keywords.
        :return: dicts of keywords with number of occurrences for each category of keywords,
                 as a tuple (positive, neutral, negative).
        """
        # Lower-case the text once instead of once per keyword.
        lowered_text = text.lower()
        count = KeywordFinder._count_occurrences
        return (
            count(lowered_text, KeywordFinder.positive_keywords),
            count(lowered_text, KeywordFinder.neutral_keywords),
            count(lowered_text, KeywordFinder.negative_keywords),
        )