Source code for goose3

"""
This is a python port of "Goose" originally licensed to Gravity.com
under one or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.

Python port was written by Xavier Grangier for Recrutae

Gravity.com licenses this file
to you under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance
with the License.  You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import logging
import os
import weakref
from tempfile import mkstemp
from typing import List, Union

from goose3.article import Article  # noqa: F401
from goose3.configuration import ArticleContextPattern, AuthorPattern, Configuration, PublishDatePattern  # noqa: F401
from goose3.crawler import CrawlCandidate, Crawler
from goose3.image import Image  # noqa: F401
from goose3.network import NetworkFetcher
from goose3.video import Video  # noqa: F401

logger = logging.getLogger(__name__)


class Goose:
    """Extract the most likely article content and additional metadata from a
    URL or a previously fetched HTML document

    Args:
        config (Configuration, dict): A configuration object or a dictionary \
            of configuration attribute names to values; any other value \
            (including None) results in the default configuration
    Returns:
        Goose: An instance of the goose extraction object
    Raises:
        Exception: If image fetching is enabled and the configured local \
            storage path cannot be created or written to"""

    def __init__(self, config: "Union[Configuration, dict, None]" = None):
        # Use the passed in configuration if it is of the right type, otherwise
        # use the default as a base
        if isinstance(config, Configuration):
            self.config = config
        else:
            self.config = Configuration()

        # if config was a passed in dict, parse it into the stored configuration;
        # keys that are not attributes of the configuration are ignored
        if isinstance(config, dict):
            for k, v in config.items():
                if hasattr(self.config, k):
                    setattr(self.config, k, v)

        # setup a single network connection
        self.fetcher = NetworkFetcher(self.config)

        # The finalizer callback must not hold a reference to ``self`` (a bound
        # method such as ``self.close`` does), otherwise the instance could never
        # be garbage collected and the callback would only ever run at
        # interpreter exit. Closing the fetcher directly avoids that.
        # NOTE(review): assumes the fetcher keeps no reference back to this
        # Goose instance -- confirm against NetworkFetcher.
        self.finalizer = weakref.finalize(self, self.fetcher.close)

        # we don't need to go further if image extractor or local_storage is not set
        if not self.config.local_storage_path or not self.config.enable_image_fetching:
            return

        # test if config.local_storage_path is a directory
        if not os.path.isdir(self.config.local_storage_path):
            os.makedirs(self.config.local_storage_path)

        if not os.path.isdir(self.config.local_storage_path):
            msg = (
                f"{self.config.local_storage_path} directory does not seem to exist, "
                "you need to set this for image processing downloads"
            )
            raise Exception(msg)

        # test to write a dummy file to the directory to check is directory is writable
        # NOTE(review): mkstemp itself raises OSError when the directory is not
        # writable, and that happens outside the handler below; left as is so
        # the exception type seen by existing callers does not change.
        level, path = mkstemp(dir=self.config.local_storage_path)
        try:
            with os.fdopen(level, "w"):
                pass
            os.remove(path)
        except OSError as exc:
            msg = (
                f"{self.config.local_storage_path} directory is not writeble, "
                "you need to set this for image processing downloads"
            )
            raise Exception(msg) from exc

    def __enter__(self):
        """Setup the context manager"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Define what to do when the context manager exits"""
        self.close()

    def close(self):
        """Close the network connection and perform any other required cleanup

        Safe to call more than once.

        Note:
            Auto closed when using goose as a context manager or when garbage collected"""
        if self.fetcher is not None:
            self.shutdown_network()
        # nothing left to clean up: make sure the finalizer can no longer fire,
        # neither on garbage collection nor at interpreter exit
        self.finalizer.detach()

    def extract(self, url: Union[str, None] = None, raw_html: Union[str, None] = None) -> "Article":
        """Extract the most likely article content from the html page

        Args:
            url (str): URL to pull and parse
            raw_html (str): String representation of the HTML page
        Returns:
            Article: Representation of the article contents \
                including other parsed and extracted metadata
        Raises:
            ValueError: If neither `url` nor `raw_html` is provided (None or empty)"""
        # a single truthiness check covers both None and empty strings
        if not url and not raw_html:
            raise ValueError("Either url or raw_html should be provided")
        crawl_candidate = CrawlCandidate(self.config, url, raw_html)
        return self.__crawl(crawl_candidate)

    def shutdown_network(self):
        """Close the network connection

        Does nothing if the connection has already been shut down.

        Note:
            Auto closed when using goose as a context manager or when garbage collected"""
        if self.fetcher is not None:
            self.fetcher.close()
            self.fetcher = None
        # the finalizer's only job is closing the fetcher; detach it so the
        # fetcher is not closed a second time on garbage collection or exit
        self.finalizer.detach()

    def __crawl(self, crawl_candidate: "CrawlCandidate") -> "Article":
        """Wrap the crawling functionality

        Runs the crawler and, when it fails with a `UnicodeDecodeError` or
        `ValueError`, retries once for every remaining entry of
        `config.available_parsers`; the last failure is re-raised.

        Args:
            crawl_candidate (CrawlCandidate): The candidate to crawl
        Returns:
            Article: The article produced by the crawler"""
        parser = self.config.parser_class
        fallback_parsers = list(self.config.available_parsers)
        fallback_parsers.remove(parser)

        # NOTE(review): the fallback parser name is only used in the log
        # message; every attempt builds the Crawler from the same, unmodified
        # configuration. Confirm whether the fallback is meant to be applied.
        while True:
            try:
                return Crawler(self.config, self.fetcher).crawl(crawl_candidate)
            except (UnicodeDecodeError, ValueError):
                logger.error("Parser %s failed to parse the content", parser)
                if not fallback_parsers:
                    raise
                parser = fallback_parsers.pop(0)  # remove it also!