# -*- coding: utf-8 -*-
"""\
This is a python port of "Goose" originally licensed to Gravity.com
under one or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.
Python port was written by Xavier Grangier for Recrutae
Gravity.com licenses this file
to you under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
class Article(object):
    ''' Container for the data extracted from a single article

        A new instance starts out empty (blank strings, empty containers and
        `None` placeholders). Each value is stored on an underscore-prefixed
        attribute and exposed through a property without a setter.

        Note:
            Presumably the underscore-prefixed attributes are filled in by \
            the extraction code elsewhere in the package; nothing in this \
            class assigns them after construction. '''

    def __init__(self):
        # textual content
        self._title = ""
        self._cleaned_text = ""

        # values taken from <meta> information
        self._meta_description = ""
        self._meta_lang = ""
        self._meta_favicon = ""
        self._meta_keywords = ""
        self._meta_encoding = []  # a list, unlike the other meta fields

        # location / identity of the article
        self._canonical_link = ""
        self._domain = ""
        self._final_url = ""
        self._link_hash = ""

        # parsed documents and nodes
        self._top_node = None
        self._top_image = None
        self._raw_html = ""
        self._doc = None
        self._raw_doc = None

        # collections extracted from the page
        self._tags = []
        self._opengraph = {}
        self._schema = None
        self._tweets = []
        self._movies = []
        self._links = []
        self._authors = []

        # publication information
        self._publish_date = None
        self._publish_datetime_utc = None

        # free-form storage for library consumers
        self._additional_data = {}

    @property
    def title(self):
        ''' str: Title extracted from the HTML source

            Note:
                Read only '''
        return self._title

    @property
    def cleaned_text(self):
        ''' str: Cleaned text of the article without HTML tags; most commonly \
            desired property

            Note:
                Read only '''
        return self._cleaned_text

    @property
    def meta_description(self):
        ''' str: Contents of the meta-description field from the HTML source

            Note:
                Read only '''
        return self._meta_description

    @property
    def meta_lang(self):
        ''' str: Contents of the meta-lang field from the HTML source

            Note:
                Read only '''
        return self._meta_lang

    @property
    def meta_favicon(self):
        ''' str: Contents of the meta-favicon field from the HTML source

            Note:
                Read only '''
        return self._meta_favicon

    @property
    def meta_keywords(self):
        ''' str: Contents of the meta-keywords field from the HTML source

            Note:
                Read only '''
        return self._meta_keywords

    @property
    def meta_encoding(self):
        ''' list: Contents of the encoding/charset field from the HTML source; \
            an empty list on a new instance

            Note:
                Read only '''
        return self._meta_encoding

    @property
    def canonical_link(self):
        ''' str: The canonical link of the article if found in the meta data

            Note:
                Read only '''
        return self._canonical_link

    @property
    def domain(self):
        ''' str: Domain of the article parsed

            Note:
                Read only '''
        return self._domain

    @property
    def top_node(self):
        ''' etree: The top Element that is a candidate for the main body of \
            the article; `None` on a new instance

            Note:
                Read only '''
        return self._top_node

    @property
    def top_image(self):
        ''' Image: The top image object that likely represents the article; \
            `None` on a new instance

            Returns:
                Image: See more information on the goose3.Image class
            Note:
                Read only '''
        return self._top_image

    @property
    def tags(self):
        ''' list(str): List of article tags (non-metadata tags)

            Note:
                Read only '''
        return self._tags

    @property
    def opengraph(self):
        ''' dict: All opengraph tag data

            Note:
                Read only '''
        return self._opengraph

    @property
    def tweets(self):
        ''' list(str): A listing of embedded tweets in the article

            Note:
                Read only '''
        return self._tweets

    @property
    def movies(self):
        ''' list(Video): A listing of all videos within the article such as \
            YouTube or Vimeo

            Returns:
                list(Video): See more information on the goose3.Video class
            Note:
                Read only '''
        return self._movies

    @property
    def links(self):
        ''' list(str): A listing of URL links within the article

            Note:
                Read only '''
        return self._links

    @property
    def authors(self):
        ''' list(str): A listing of authors as parsed from the meta tags

            Note:
                Read only '''
        return self._authors

    @property
    def final_url(self):
        ''' str: The URL that was used to pull and parse the article; an \
            empty string on a new instance

            Note:
                Read only; may be `None` if raw_html was used and no url \
                element was found (set outside this class, not verified here) '''
        return self._final_url

    @property
    def link_hash(self):
        ''' str: The MD5 of the final url to be used for various \
            identification tasks

            Note:
                Read only '''
        return self._link_hash

    @property
    def raw_html(self):
        ''' str: The HTML represented as a string

            Note:
                Read only '''
        return self._raw_html

    @property
    def doc(self):
        ''' etree: lxml document that is being processed

            Note:
                Read only '''
        return self._doc

    @property
    def raw_doc(self):
        ''' etree: Original, uncleaned, and untouched lxml document to be \
            processed

            Note:
                Read only '''
        return self._raw_doc

    @property
    def schema(self):
        ''' dict: All schema tag data; `None` on a new instance

            Note:
                Read only '''
        return self._schema

    @property
    def publish_date(self):
        ''' str: The date the article was published based on meta tag \
            extraction; `None` on a new instance

            Note:
                Read only '''
        return self._publish_date

    @property
    def publish_datetime_utc(self):
        ''' datetime.datetime: The date time version of the published date \
            based on meta tag extraction in the UTC timezone, if timezone \
            information is known; `None` on a new instance

            Note:
                Read only '''
        return self._publish_datetime_utc

    @property
    def additional_data(self):
        ''' dict: A property bucket for consumers of goose3 to store custom \
            data extractions

            Note:
                Read only '''
        return self._additional_data

    @property
    def infos(self):
        ''' dict: The summation of all data available about the extracted \
            article

            A new outer dict is built on every access. The `image` entry is
            `None` when there is no top image, otherwise a dict with the keys
            `url`, `width`, `height` and `type`. The `movies` entry is a list
            with one dict per video. Containers such as `tags` or `opengraph`
            are placed in the result by reference, not copied.

            Note:
                Read only '''
        # Summarise the top image, if any, as a plain dict.
        top_image = self.top_image
        image_info = None
        if top_image is not None:
            image_info = {
                'url': top_image.src,
                'width': top_image.width,
                'height': top_image.height,
                'type': 'image'
            }

        # Flatten every video object into a plain dict.
        movie_infos = [
            {
                'embed_type': movie.embed_type,
                'provider': movie.provider,
                'width': movie.width,
                'height': movie.height,
                'embed_code': movie.embed_code,
                'src': movie.src,
            }
            for movie in self.movies
        ]

        return {
            "meta": {
                "description": self.meta_description,
                "lang": self.meta_lang,
                "keywords": self.meta_keywords,
                "favicon": self.meta_favicon,
                "canonical": self.canonical_link,
                "encoding": self.meta_encoding
            },
            "image": image_info,
            "domain": self.domain,
            "title": self.title,
            "cleaned_text": self.cleaned_text,
            "opengraph": self.opengraph,
            "tags": self.tags,
            "tweets": self.tweets,
            "movies": movie_infos,
            "links": self.links,
            "authors": self.authors,
            "publish_date": self.publish_date
        }