Source code for goose3.article

# -*- coding: utf-8 -*-
"""\
This is a python port of "Goose" originally licensed to Gravity.com
under one or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.

Python port was written by Xavier Grangier for Recrutae

Gravity.com licenses this file
to you under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance
with the License.  You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""


class Article(object):
    ''' Container for the data extracted from a single article.

        Every field is stored on an underscore-prefixed attribute that is
        initialised to an empty default in ``__init__`` and exposed through
        a property without a setter. The attributes are filled in by code
        outside this class (presumably the crawler and extractors).

        Note:
            The properties hand back the stored objects themselves, so
            mutable values (lists, dicts) can still be modified in place. '''

    def __init__(self):
        # Plain-text and metadata fields start out empty.
        self._title = ""
        self._cleaned_text = ""
        self._meta_description = ""
        self._meta_lang = ""
        self._meta_favicon = ""
        self._meta_keywords = ""
        self._meta_encoding = []
        self._canonical_link = ""
        self._domain = ""
        # Parsed nodes / rich objects are None until they are set.
        self._top_node = None
        self._top_image = None
        self._tags = []
        self._opengraph = {}
        self._tweets = []
        self._movies = []
        self._links = []
        self._authors = []
        self._final_url = ""
        self._link_hash = ""
        self._raw_html = ""
        self._schema = None
        self._doc = None
        self._raw_doc = None
        self._publish_date = None
        self._publish_datetime_utc = None
        self._additional_data = {}

    @property
    def title(self):
        ''' str: Title extracted from the HTML source

            Note:
                Read only '''
        return self._title

    @property
    def cleaned_text(self):
        ''' str: Cleaned text of the article without HTML tags; most commonly desired property

            Note:
                Read only '''
        return self._cleaned_text

    @property
    def meta_description(self):
        ''' str: Contents of the meta-description field from the HTML source

            Note:
                Read only '''
        return self._meta_description

    @property
    def meta_lang(self):
        ''' str: Contents of the meta-lang field from the HTML source

            Note:
                Read only '''
        return self._meta_lang

    @property
    def meta_favicon(self):
        ''' str: Contents of the meta-favicon field from the HTML source

            Note:
                Read only '''
        return self._meta_favicon

    @property
    def meta_keywords(self):
        ''' str: Contents of the meta-keywords field from the HTML source

            Note:
                Read only '''
        return self._meta_keywords

    @property
    def meta_encoding(self):
        ''' list: Contents of the encoding/charset field(s) from the HTML source;
            an empty list until populated

            Note:
                Read only '''
        return self._meta_encoding

    @property
    def canonical_link(self):
        ''' str: The canonical link of the article if found in the meta data

            Note:
                Read only '''
        return self._canonical_link

    @property
    def domain(self):
        ''' str: Domain of the article parsed

            Note:
                Read only '''
        return self._domain

    @property
    def top_node(self):
        ''' etree: The top Element that is a candidate for the main body of the article;
            `None` until populated

            Note:
                Read only '''
        return self._top_node

    @property
    def top_image(self):
        ''' Image: The top image object that likely represents the article;
            `None` until populated

            Returns:
                Image: See more information on the goose3.Image class
            Note:
                Read only '''
        return self._top_image

    @property
    def tags(self):
        ''' list(str): List of article tags (non-metadata tags)

            Note:
                Read only '''
        return self._tags

    @property
    def opengraph(self):
        ''' dict: All opengraph tag data

            Note:
                Read only '''
        return self._opengraph

    @property
    def tweets(self):
        ''' list(str): A listing of embedded tweets in the article

            Note:
                Read only '''
        return self._tweets

    @property
    def movies(self):
        ''' list(Video): A listing of all videos within the article such as
            YouTube or Vimeo

            Returns:
                list(Video):  See more information on the goose3.Video class
            Note:
                Read only '''
        return self._movies

    @property
    def links(self):
        ''' list(str): A listing of URL links within the article

            Note:
                Read only '''
        return self._links

    @property
    def authors(self):
        ''' list(str): A listing of authors as parsed from the meta tags

            Note:
                Read only '''
        return self._authors

    @property
    def final_url(self):
        ''' str: The URL that was pulled and parsed; an empty string by default.
            It may be `None` if raw_html was used and no url element was found
            (depends on the code that populates the article).

            Note:
                Read only '''
        return self._final_url

    @property
    def link_hash(self):
        ''' str: The MD5 of the final url to be used for various identification tasks

            Note:
                Read only '''
        return self._link_hash

    @property
    def raw_html(self):
        ''' str: The HTML represented as a string

            Note:
                Read only '''
        return self._raw_html

    @property
    def doc(self):
        ''' etree: lxml document that is being processed; `None` until populated

            Note:
                Read only '''
        return self._doc

    @property
    def raw_doc(self):
        ''' etree: Original, uncleaned, and untouched lxml document to be processed;
            `None` until populated

            Note:
                Read only '''
        return self._raw_doc

    @property
    def schema(self):
        ''' dict: All schema tag data; `None` until populated

            Note:
                Read only '''
        return self._schema

    @property
    def publish_date(self):
        ''' str: The date the article was published based on meta tag extraction;
            `None` until populated

            Note:
                Read only '''
        return self._publish_date

    @property
    def publish_datetime_utc(self):
        ''' datetime.datetime: The date time version of the published date based on
            meta tag extraction in the UTC timezone, if timezone information is
            known; `None` until populated

            Note:
                Read only '''
        return self._publish_datetime_utc

    @property
    def additional_data(self):
        ''' dict: A property bucket for consumers of goose3 to store custom data extractions

            Note:
                Read only (the dict itself can be filled in place) '''
        return self._additional_data

    @property
    def infos(self):
        ''' dict: A summary of the extracted article built from the other properties

            The result holds the keys ``meta``, ``image``, ``domain``,
            ``title``, ``cleaned_text``, ``opengraph``, ``tags``, ``tweets``,
            ``movies``, ``links``, ``authors`` and ``publish_date``.
            ``image`` is `None` when there is no top image, otherwise a dict
            with ``url``, ``width``, ``height`` and ``type``. ``movies`` is a
            list with one dict per video.

            Note:
                Read only. A new outer dict is built on every access, but the
                list/dict values (e.g. ``tags``, ``opengraph``) are the
                article's own objects, not copies. '''
        # image: flatten the top image (if any) into plain values
        top_image = self.top_image
        image = None
        if top_image is not None:
            image = {
                'url': top_image.src,
                'width': top_image.width,
                'height': top_image.height,
                'type': 'image'
            }

        # movies: one plain dict per video object
        movies = [
            {
                'embed_type': movie.embed_type,
                'provider': movie.provider,
                'width': movie.width,
                'height': movie.height,
                'embed_code': movie.embed_code,
                'src': movie.src,
            }
            for movie in self.movies
        ]

        # Key order is kept stable so serialised output does not change.
        return {
            "meta": {
                "description": self.meta_description,
                "lang": self.meta_lang,
                "keywords": self.meta_keywords,
                "favicon": self.meta_favicon,
                "canonical": self.canonical_link,
                "encoding": self.meta_encoding
            },
            "image": image,
            "domain": self.domain,
            "title": self.title,
            "cleaned_text": self.cleaned_text,
            "opengraph": self.opengraph,
            "tags": self.tags,
            "tweets": self.tweets,
            "movies": movies,
            "links": self.links,
            "authors": self.authors,
            "publish_date": self.publish_date
        }