# -*- coding: utf-8 -*-
"""\
This is a python port of "Goose" originally licensed to Gravity.com
under one or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.
Python port was written by Xavier Grangier for Recrutae
Gravity.com licenses this file
to you under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import os
import tempfile
from goose3.text import StopWords
from goose3.parsers import Parser, ParserSoup
from goose3.version import __version__
# Registry of the supported parser back-ends, keyed by the short name that
# `Configuration.parser_class` holds and `Configuration.get_parser` looks up.
AVAILABLE_PARSERS = dict(lxml=Parser, soup=ParserSoup)
class ArticleContextPattern(object):
    ''' Help ensure correctly generated article context patterns

        Args:
            attr (str): The attribute type: class, id, etc
            value (str): The value of the attribute
            tag (str): The type of tag, such as `article` that contains the
                main article body
            domain (str): The domain to which this pattern pertains (optional)
        Raises:
            Exception: If neither a complete (`attr`, `value`) pair nor a
                `tag` is provided
        Note:
            Must provide, at a minimum, (attr and value) or (tag)
    '''
    __slots__ = ['attr', 'value', 'tag', 'domain']

    def __init__(self, *, attr=None, value=None, tag=None, domain=None):
        # `attr` and `value` only count as a pair: a lone `attr` or a lone
        # `value` does not satisfy the documented minimum, so a `tag` is
        # required in that case.
        if not (attr and value) and not tag:
            raise Exception("`attr` and `value` must be set or `tag` must be set")
        self.attr = attr
        self.value = value
        self.tag = tag
        self.domain = domain

    def __repr__(self):
        ''' Return a readable summary of every field of the pattern '''
        return "ArticleContextPattern(attr={} value={} tag={} domain={})".format(
            self.attr, self.value, self.tag, self.domain)
# Built-in article-body patterns; `Configuration` copies this list as its
# starting set of context patterns.
# NOTE(review): entry order is presumably the search priority -- confirm
# against the extractor before reordering.
KNOWN_ARTICLE_CONTENT_PATTERNS = [
    ArticleContextPattern(**spec)
    for spec in (
        {'attr': 'class', 'value': 'short-story'},
        {'attr': 'itemprop', 'value': 'articleBody'},
        {'attr': 'class', 'value': 'post-content'},
        {'attr': 'class', 'value': 'g-content'},
        {'attr': 'class', 'value': 'post-outer'},
        {'tag': 'article'},
    )
]
class PublishDatePattern(object):
    ''' Ensure correctly formed publish date patterns; to be used in
        conjunction with the configuration `known_publish_date_tags` property

        Args:
            attr (str): The attribute type: class, id, etc
            value (str): The value of the attribute
            content (str): The name of another attribute (of the element)
                that contains the value
            subcontent (str): The name of a json object key (optional)
            tag (str): The type of tag, such as `time` that contains the
                publish date
            domain (str): The domain to which this pattern pertains (optional)
        Raises:
            Exception: If neither a complete (`attr`, `value`) pair nor a
                `tag` is provided
        Note:
            Must provide, at a minimum, (attr and value) or (tag)
    '''
    __slots__ = ['attr', 'value', 'content', 'subcontent', 'tag', 'domain']

    def __init__(self, *, attr=None, value=None, content=None, subcontent=None,
                 tag=None, domain=None):
        # `attr` and `value` only count as a pair: a lone `attr` or a lone
        # `value` does not satisfy the documented minimum, so a `tag` is
        # required in that case.
        if not (attr and value) and not tag:
            raise Exception("`attr` and `value` must be set or `tag` must be set")
        self.attr = attr
        self.value = value
        self.content = content
        self.subcontent = subcontent
        self.tag = tag
        self.domain = domain

    def __repr__(self):
        ''' Return a readable summary; the fields shown depend on whether
            the pattern is tag based '''
        if self.tag:
            rpr = "PublishDatePattern(tag={}, attr={}, value={} domain={})"
            return rpr.format(self.tag, self.attr, self.value, self.domain)
        rpr = "PublishDatePattern(attr={}, value={} content={} subcontent={} domain={})"
        return rpr.format(self.attr, self.value, self.content, self.subcontent, self.domain)
# Built-in publish-date patterns; `Configuration` copies this list as its
# starting set of publish date tags.
# NOTE(review): entry order is presumably the search priority -- confirm
# against the extractor before reordering.
KNOWN_PUBLISH_DATE_TAGS = [
    PublishDatePattern(**spec)
    for spec in (
        dict(attr='property', value='rnews:datePublished', content='content'),
        dict(attr='property', value='article:published_time', content='content'),
        dict(attr='name', value='OriginalPublicationDate', content='content'),
        dict(attr='itemprop', value='datePublished', content='datetime'),
        dict(attr='name', value='published_time_telegram', content='content'),
        dict(attr='name', value='parsely-page', content='content', subcontent='pub_date'),
        dict(tag='time'),
        dict(attr='itemprop', value='datePublished', content='content'),
    )
]
class AuthorPattern(object):
    ''' Ensures that the author patterns are correctly formed for use with the
        `known_author_patterns` of configuration

        Args:
            attr (str): The attribute type: class, id, etc
            value (str): The value of the attribute
            content (str): The name of another attribute (of the element)
                that contains the value
            tag (str): The type of tag, such as `author` that contains the
                author information
            subpattern (AuthorPattern): A subpattern for elements within the
                main attribute
        Raises:
            Exception: If neither a complete (`attr`, `value`) pair nor a
                `tag` is provided
    '''
    __slots__ = ['attr', 'value', 'content', 'tag', 'subpattern']

    def __init__(self, *, attr=None, value=None, content=None, tag=None, subpattern=None):
        # `attr` and `value` only count as a pair: a lone `attr` or a lone
        # `value` is not enough on its own, so a `tag` is required in that
        # case (this is what the error message below states).
        if not (attr and value) and not tag:
            raise Exception("`attr` and `value` must be set or `tag` must be set")
        self.attr = attr
        self.value = value
        self.content = content
        self.tag = tag
        self.subpattern = subpattern

    def __repr__(self):
        ''' Return a readable summary; the fields shown depend on whether
            the pattern is tag based '''
        if self.tag:
            rpr = "AuthorPattern(tag={}, attr={}, value={})"
            return rpr.format(self.tag, self.attr, self.value)
        rpr = "AuthorPattern(attr={}, value={} content={} subpattern={})"
        return rpr.format(self.attr, self.value, self.content, self.subpattern)
# Built-in author patterns; `Configuration` copies this list as its starting
# set of author patterns.
KNOWN_AUTHOR_PATTERNS = [
    # element marked itemprop="author" whose nested itemprop="name" is used
    AuthorPattern(
        attr='itemprop',
        value='author',
        subpattern=AuthorPattern(attr='itemprop', value='name'),
    ),
    # element with name="author", reading its `content` attribute
    AuthorPattern(attr='name', value='author', content='content'),
]
class Configuration(object):
    ''' Container for every setting exposed by this module

        Each setting is stored on a private attribute and exposed through a
        property. Setters of boolean, float and integer settings coerce the
        supplied value to that type; the remaining setters store the value
        unchanged.
    '''

    def __init__(self):
        # parser information
        self._available_parsers = list(AVAILABLE_PARSERS.keys())
        self._parser_class = 'lxml'

        # URL extraction parameters
        self._browser_user_agent = 'Goose/%s' % __version__
        self._http_timeout = 30.0
        self._http_auth = None
        self._http_proxies = None
        self._http_headers = None

        # extraction information
        self._local_storage_path = os.path.join(tempfile.gettempdir(), 'goose')
        # Copies, so that per-instance additions never leak into the
        # module-level defaults.
        self._known_context_patterns = KNOWN_ARTICLE_CONTENT_PATTERNS[:]
        self._known_publish_date_tags = KNOWN_PUBLISH_DATE_TAGS[:]
        self._known_author_patterns = KNOWN_AUTHOR_PATTERNS[:]
        self._target_language = 'en'
        self._use_meta_language = True

        # general configuration
        self._strict = True
        self._debug = False
        self._stopwords_class = StopWords

        # imagemagick executable paths
        self._imagemagick_convert_path = "/opt/local/bin/convert"  # Not used
        self._imagemagick_identify_path = "/opt/local/bin/identify"  # not used

        # image extraction
        self._enable_image_fetching = False
        self._images_min_bytes = 4500
        # Do we need to allow setting one's own ImageExtractor class?

        self._parse_lists = True
        self._pretty_lists = True
        self._parse_headers = True
        self._keep_footnotes = True

    @property
    def known_context_patterns(self):
        ''' list(ArticleContextPattern): The context patterns to search to
            find the likely article content

            Note:
                Assigning to this property adds to the list (new entries are
                placed first); it does not replace the existing patterns
        '''
        return self._known_context_patterns

    @known_context_patterns.setter
    def known_context_patterns(self, val):
        ''' val must be an ArticleContextPattern, a dictionary, or a list of
            either

            e.g., {'attr': 'class', 'value': 'my-article-class'}
            or [{'attr': 'class', 'value': 'my-article-class'},
                {'attr': 'id', 'value': 'my-article-id'}]

            Dictionaries use the keys `attr` and `value`, and/or `tag`, plus
            an optional `domain`.

            Raises:
                Exception: If `val` is of an unsupported type or a dictionary
                    contains neither `tag` nor `attr`
                KeyError: If a dictionary has `attr` but no `value`
        '''
        def create_pat_from_dict(val):
            '''Helper function used to create an ArticleContextPattern from a dictionary
            '''
            if "tag" in val:
                pat = ArticleContextPattern(tag=val["tag"])
                if "attr" in val:
                    pat.attr = val["attr"]
                    pat.value = val["value"]
            elif "attr" in val:
                pat = ArticleContextPattern(attr=val["attr"], value=val["value"])
            else:
                # Without this branch `pat` would be unbound below and the
                # caller would get an opaque UnboundLocalError.
                raise Exception(
                    "Pattern dictionary must contain `tag` or `attr` and `value`: {}".format(val))

            if "domain" in val:
                pat.domain = val["domain"]

            return pat

        if isinstance(val, list):
            # New entries go first; a new list is bound, the old one is untouched.
            self._known_context_patterns = [
                x if isinstance(x, ArticleContextPattern) else create_pat_from_dict(x)
                for x in val
            ] + self.known_context_patterns
        elif isinstance(val, dict):
            self._known_context_patterns.insert(0, create_pat_from_dict(val))
        elif isinstance(val, ArticleContextPattern):
            self._known_context_patterns.insert(0, val)
        else:
            raise Exception("Unknown type: {}. Use a ArticleContextPattern.".format(type(val)))

    @property
    def known_publish_date_tags(self):
        ''' list(PublishDatePattern): The tags to search to find the likely
            published date

            Note:
                Assigning to this property adds to the list (new entries are
                placed first); it does not replace the existing patterns
        '''
        return self._known_publish_date_tags

    @known_publish_date_tags.setter
    def known_publish_date_tags(self, val):
        ''' val must be a PublishDatePattern, a dictionary, or a list of
            either

            e.g., {'attribute': 'name', 'value': 'my-pubdate', 'content': 'datetime'}
            or [{'attribute': 'name', 'value': 'my-pubdate', 'content': 'datetime'},
                {'attribute': 'property', 'value': 'pub_time', 'content': 'content'}]

            Dictionaries use the keys `attribute`, `value` and `content`,
            and/or `tag`, plus the optional `subcontent` and `domain`.

            Raises:
                Exception: If `val` is of an unsupported type or a dictionary
                    contains neither `tag` nor `attribute`
                KeyError: If a dictionary has `attribute` but lacks `value`
                    (or, without `tag`, lacks `content`)
        '''
        def create_pat_from_dict(val):
            '''Helper function used to create an PublishDatePattern from a dictionary
            '''
            if "tag" in val:
                pat = PublishDatePattern(tag=val["tag"])
                if "attribute" in val:
                    pat.attr = val["attribute"]
                    pat.value = val["value"]
            elif "attribute" in val:
                pat = PublishDatePattern(attr=val["attribute"], value=val["value"],
                                         content=val["content"])
            else:
                # Without this branch `pat` would be unbound below and the
                # caller would get an opaque UnboundLocalError.
                raise Exception(
                    "Pattern dictionary must contain `tag` or `attribute` and `value`: {}".format(val))

            if "subcontent" in val:
                pat.subcontent = val["subcontent"]
            if "domain" in val:
                pat.domain = val["domain"]

            return pat

        if isinstance(val, list):
            # New entries go first; a new list is bound, the old one is untouched.
            self._known_publish_date_tags = [
                x if isinstance(x, PublishDatePattern) else create_pat_from_dict(x)
                for x in val
            ] + self.known_publish_date_tags
        elif isinstance(val, dict):
            self._known_publish_date_tags.insert(0, create_pat_from_dict(val))
        elif isinstance(val, PublishDatePattern):
            self._known_publish_date_tags.insert(0, val)
        else:
            raise Exception("Unknown type: {}. Use a PublishDatePattern.".format(type(val)))

    @property
    def known_author_patterns(self):
        ''' list(AuthorPattern): The patterns to search to find the likely
            authors

            Note:
                Assigning to this property adds to the list (new entries are
                placed first); it does not replace the existing patterns
        '''
        return self._known_author_patterns

    @known_author_patterns.setter
    def known_author_patterns(self, val):
        ''' val must be an AuthorPattern, a dictionary, or a list of either

            e.g., {'attribute': 'name', 'value': 'author', 'content': 'content'}
            or [{'attribute': 'name', 'value': 'author', 'content': 'content'},
                {'attribute': 'itemprop', 'value': 'author',
                 'subpattern': {'attribute': 'itemprop', 'value': 'name'}}]

            Dictionaries use the keys `attribute` and `value`, and/or `tag`,
            plus the optional `content` and `subpattern` (itself a
            dictionary of the same shape).

            Raises:
                Exception: If `val` is of an unsupported type or a dictionary
                    contains neither `tag` nor `attribute`
                KeyError: If a dictionary has `attribute` but no `value`
        '''
        def create_pat_from_dict(val):
            '''Helper function used to create an AuthorPatterns from a dictionary
            '''
            if "tag" in val:
                pat = AuthorPattern(tag=val["tag"])
                if "attribute" in val:
                    pat.attr = val["attribute"]
                    pat.value = val["value"]
            elif "attribute" in val:
                # `content` is optional on AuthorPattern, so a missing key
                # must not fail (subpattern dictionaries rarely carry one).
                pat = AuthorPattern(attr=val["attribute"], value=val["value"],
                                    content=val.get("content"))
            else:
                # Without this branch `pat` would be unbound below and the
                # caller would get an opaque UnboundLocalError.
                raise Exception(
                    "Pattern dictionary must contain `tag` or `attribute` and `value`: {}".format(val))

            if "subpattern" in val:
                pat.subpattern = create_pat_from_dict(val["subpattern"])

            return pat

        if isinstance(val, list):
            # New entries go first; a new list is bound, the old one is untouched.
            self._known_author_patterns = [
                x if isinstance(x, AuthorPattern) else create_pat_from_dict(x)
                for x in val
            ] + self.known_author_patterns
        elif isinstance(val, dict):
            self._known_author_patterns.insert(0, create_pat_from_dict(val))
        elif isinstance(val, AuthorPattern):
            self._known_author_patterns.insert(0, val)
        else:
            raise Exception("Unknown type: {}. Use an AuthorPattern.".format(type(val)))

    @property
    def strict(self):
        ''' bool: Enable `strict mode` and throw exceptions instead of
            swallowing them.

            Note:
                Defaults to `True` '''
        return self._strict

    @strict.setter
    def strict(self, val):
        ''' set the strict property '''
        self._strict = bool(val)

    @property
    def http_timeout(self):
        ''' float: The time delay to pass to `requests` to wait for the response
            in seconds

            Note:
                Defaults to 30.0 '''
        return self._http_timeout

    @http_timeout.setter
    def http_timeout(self, val):
        ''' set the http_timeout property '''
        self._http_timeout = float(val)

    @property
    def local_storage_path(self):
        ''' str: The local path to store temporary files

            Note:
                Defaults to the value of `os.path.join(tempfile.gettempdir(), 'goose')` '''
        return self._local_storage_path

    @local_storage_path.setter
    def local_storage_path(self, val):
        ''' set the local_storage_path property '''
        self._local_storage_path = val

    @property
    def debug(self):
        ''' bool: Turn on or off debugging

            Note:
                Defaults to `False`
            Warning:
                Debugging is currently not implemented '''
        return self._debug

    @debug.setter
    def debug(self, val):
        ''' set the debug property '''
        self._debug = bool(val)

    @property
    def parser_class(self):
        ''' str: The key of the parser to use

            Note:
                Defaults to `lxml` '''
        return self._parser_class

    @parser_class.setter
    def parser_class(self, val):
        ''' set the parser_class property

            Note:
                The value is stored without validation; an unknown key only
                fails later, when `get_parser` looks it up '''
        self._parser_class = val

    @property
    def available_parsers(self):
        ''' list(str): A list of all possible parser values for the parser_class

            Note:
                Not settable '''
        return self._available_parsers

    @property
    def http_auth(self):
        ''' tuple: Authentication class and information to pass to the requests
            library

            See Also:
                `Requests Authentication <http://docs.python-requests.org/en/master/user/authentication/>`__
        '''
        return self._http_auth

    @http_auth.setter
    def http_auth(self, val):
        ''' set the http_auth property '''
        self._http_auth = val

    @property
    def http_proxies(self):
        ''' dict: Proxy information to pass directly to the supporting `requests` object

            See Also:
                `Requests Proxy Support <http://docs.python-requests.org/en/master/user/advanced/#proxies>`__
        '''
        return self._http_proxies

    @http_proxies.setter
    def http_proxies(self, val):
        ''' set the http_proxies property '''
        self._http_proxies = val

    @property
    def http_headers(self):
        ''' dict: Custom headers to pass directly to the supporting `requests` object

            See Also:
                `Requests Custom Headers <http://docs.python-requests.org/en/master/user/quickstart/#custom-headers>`__
        '''
        return self._http_headers

    @http_headers.setter
    def http_headers(self, val):
        ''' set the http_headers property '''
        self._http_headers = val

    @property
    def browser_user_agent(self):
        ''' str: Browser user agent string to use when making URL requests

            Note:
                Defaults to `Goose/{goose3.__version__}`

            Examples:
                Using the non-standard browser agent string is advised when pulling
                frequently

                >>> config.browser_user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2)'
                >>> config.browser_user_agent = 'AppleWebKit/534.52.7 (KHTML, like Gecko)'
                >>> config.browser_user_agent = 'Version/5.1.2 Safari/534.52.7'
        '''
        return self._browser_user_agent

    @browser_user_agent.setter
    def browser_user_agent(self, val):
        ''' set the browser user agent string '''
        self._browser_user_agent = val

    @property
    def imagemagick_identify_path(self):
        ''' str: Path to the identify program that is part of imagemagick

            Note:
                Defaults to `"/opt/local/bin/identify"`
            Warning:
                Currently not used / implemented '''
        return self._imagemagick_identify_path

    @imagemagick_identify_path.setter
    def imagemagick_identify_path(self, val):
        ''' set the imagemagick identify program path '''
        self._imagemagick_identify_path = val

    @property
    def imagemagick_convert_path(self):
        ''' str: Path to the convert program that is part of imagemagick

            Note:
                Defaults to `"/opt/local/bin/convert"`
            Warning:
                Currently not used / implemented '''
        return self._imagemagick_convert_path

    @imagemagick_convert_path.setter
    def imagemagick_convert_path(self, val):
        ''' set the imagemagick convert program path '''
        self._imagemagick_convert_path = val

    @property
    def stopwords_class(self):
        ''' StopWords: The StopWords class to use when analyzing article content

            Note:
                Defaults to the english stop words
            Note:
                Current stop words available in `goose3.text` include:
                `StopWords`, `StopWordsChinese`, `StopWordsArabic`, and `StopWordsKorean`
        '''
        return self._stopwords_class

    @stopwords_class.setter
    def stopwords_class(self, val):
        ''' set the stopwords class to use '''
        # TODO: add a check to see if a valid class is provided!
        self._stopwords_class = val

    @property
    def target_language(self):
        ''' str: The default target language if the language is not extractable
            or if use_meta_language is set to False

            Note:
                Default language is 'en'
        '''
        return self._target_language

    @target_language.setter
    def target_language(self, val):
        ''' set the target language property '''
        self._target_language = val

    @property
    def use_meta_language(self):
        ''' bool: Determine if language should be extracted from the meta tags
            or not. If this is set to `False` then the target_language will be
            used. Also, if extraction fails then the target_language will be
            utilized.

            Note:
                Defaults to `True` '''
        return self._use_meta_language

    @use_meta_language.setter
    def use_meta_language(self, val):
        ''' set the use_meta_language property '''
        self._use_meta_language = bool(val)

    @property
    def enable_image_fetching(self):
        ''' bool: Turn on or off image extraction

            Note:
                Defaults to `False` '''
        return self._enable_image_fetching

    @enable_image_fetching.setter
    def enable_image_fetching(self, val):
        ''' set the enable_image_fetching property '''
        self._enable_image_fetching = bool(val)

    @property
    def images_min_bytes(self):
        ''' int: Minimum number of bytes for an image to be evaluated to be the
            main image of the site

            Note:
                Defaults to 4500 bytes '''
        return self._images_min_bytes

    @images_min_bytes.setter
    def images_min_bytes(self, val):
        ''' set the images_min_bytes property '''
        self._images_min_bytes = int(val)

    @property
    def pretty_lists(self):
        ''' bool: Specify if lists should be pretty printed in the cleaned_text
            output

            Note:
                Defaults to `True` '''
        return self._pretty_lists

    @pretty_lists.setter
    def pretty_lists(self, val):
        ''' set if lists should be pretty printed '''
        self._pretty_lists = bool(val)

    @property
    def parse_lists(self):
        ''' bool: Specify if lists should be parsed or not

            Note:
                Defaults to `True` '''
        return self._parse_lists

    @parse_lists.setter
    def parse_lists(self, val):
        ''' set if lists should be parsed '''
        self._parse_lists = bool(val)

    @property
    def parse_headers(self):
        ''' bool: Specify if headers should be pulled or not in the cleaned_text
            output

            Note:
                Defaults to `True`'''
        return self._parse_headers

    @parse_headers.setter
    def parse_headers(self, val):
        ''' set if headers should be parsed '''
        self._parse_headers = bool(val)

    @property
    def keep_footnotes(self):
        ''' bool: Specify if footnotes should be kept or not in the cleaned_text
            output

            Note:
                Defaults to `True`'''
        return self._keep_footnotes

    @keep_footnotes.setter
    def keep_footnotes(self, val):
        ''' set if footnotes should be kept '''
        self._keep_footnotes = bool(val)

    def get_parser(self):
        ''' Retrieve the current parser class to use for extraction

            Returns:
                Parser: The parser to use
            Raises:
                KeyError: If `parser_class` is not a key of `AVAILABLE_PARSERS` '''
        return AVAILABLE_PARSERS[self.parser_class]