Module cmc.modules.base
Module for storing settings for Selenium and requests used by other cmc-py modules.
A random User-Agent and a proxy are used for the requests session and the Selenium driver in order to circumvent an IP ban. Data is scraped through Selenium (to load JavaScript components) and BeautifulSoup (to parse website data).
Expand source code
#!/usr/bin/env python
"""Module for storing settings for Selenium and requests used by
other cmc-py modules.
A random User-Agent and a proxy are used for the requests session and Selenium
driver in order to circumvent an IP ban. Data is scraped through Selenium
(to load JavaScript components) and BeautifulSoup (to parse website data).
"""
import os
import random
import re
import time
from typing import Dict, Optional
import requests
from requests.structures import CaseInsensitiveDict
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.proxy import Proxy, ProxyType
from webdriver_manager.chrome import ChromeDriverManager
from cmc.resources.user_agents import user_agents
from cmc.utils.exceptions import ProxyTimeOut, InvalidProxy
class CMCBaseClass:
    """Base class holding the Selenium and requests settings for cmc-py modules.

    Every instance picks a random User-Agent for its ``requests`` session and
    uses a proxy (the one passed in, or one fetched from a public proxy API)
    for the session and for the Selenium ``Proxy`` object it builds.
    """

    # Seconds to wait for a proxy API to answer. ``requests`` applies no
    # timeout by default, so without this an unresponsive API would block
    # forever instead of falling back / raising ProxyTimeOut.
    _PROXY_API_TIMEOUT = 10

    # IPv4 dotted quad (each octet 0-255) followed by ":" and a numeric port.
    # Compiled once at class level instead of on every validation.
    _PROXY_PATTERN = re.compile(
        r"^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]):[0-9]+$"
    )

    def __init__(self, proxy: Optional[str]):
        """
        Args:
            proxy (Optional[str]): Proxy (``IP:Port``) to be used for Selenium
                and the requests Session. When ``None``, a proxy is fetched
                from a public proxy API.

        Raises:
            ProxyTimeOut: Raised when no proxy was given and none could be
                fetched from the proxy APIs.
            InvalidProxy: Raised if the proxy is not a valid ``IP:Port`` string.
        """
        self.current_dir = os.path.dirname(os.path.realpath(__file__))
        self.parent_dir = os.path.dirname(self.current_dir)
        self.cmc_url = "https://coinmarketcap.com"
        self.__proxy_url_1 = "https://public.freeproxyapi.com/api/Proxy/ProxyByType/0/4"
        self.__proxy_url_2 = "http://pubproxy.com/api/proxy?https=true"

        # The session must exist before a proxy is fetched, because the
        # proxy lookup itself goes through this session (without a proxy).
        self.headers = CaseInsensitiveDict({"User-Agent": self.__get_random_user_agent})
        self.session = requests.Session()
        self.session.headers = self.headers

        self.proxy: str = self.__get_proxy if proxy is None else proxy
        # Property access is what triggers validation; raises InvalidProxy.
        self.__check_proxy
        self.session.proxies = {"https": self.proxy}

        # Same proxy address for every protocol of the Selenium proxy.
        self.selenium_proxy = Proxy()
        self.selenium_proxy.proxy_type = ProxyType.MANUAL
        self.selenium_proxy.http_proxy = self.proxy
        self.selenium_proxy.socks_proxy = self.proxy
        self.selenium_proxy.ssl_proxy = self.proxy

        self.driver_options = webdriver.ChromeOptions()
        # NOTE(review): this sets an attribute named ``Proxy`` (capital P);
        # Selenium's options expose ``proxy`` (lowercase) -- presumably that
        # was intended. Left unchanged to avoid altering driver behaviour;
        # confirm and fix deliberately.
        self.driver_options.Proxy = self.selenium_proxy
        self.driver_options.add_argument("headless")
        self.driver_options.add_argument("--no-sandbox")
        self.driver_options.add_argument("--log-level=3")
        self.driver_options.add_argument("ignore-certificate-errors")
        self.driver_options.add_experimental_option(
            "excludeSwitches", ["enable-logging"]
        )
        self.service = Service(ChromeDriverManager(log_level=0).install())

    @property
    def __get_proxy(self) -> str:
        """Fetch a random HTTPS proxy for using with Selenium.

        The first proxy API is tried; if the request or the parsing of its
        response fails for any reason, the second API is tried.

        Raises:
            ProxyTimeOut: Raised when a proxy cannot be fetched from either API.

        Returns:
            str: Fetched proxy from the API, as ``host:port``.
        """
        try:
            primary = self.session.get(
                self.__proxy_url_1, timeout=self._PROXY_API_TIMEOUT
            ).json()
            proxy: str = primary["host"] + ":" + str(primary["port"])
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            try:
                fallback = self.session.get(
                    self.__proxy_url_2, timeout=self._PROXY_API_TIMEOUT
                ).json()
                proxy = fallback["data"][0]["ipPort"]
            except Exception as err:
                # Keep the underlying failure attached for debugging.
                raise ProxyTimeOut from err
        # Pause after a successful fetch, as the original code did.
        time.sleep(1.5)
        return proxy

    @property
    def __get_random_user_agent(self) -> str:
        """Fetch a random User-Agent for using with requests
        Session.

        Returns:
            str: User-Agent for requests Session header.
        """
        result: str = random.choice(user_agents)
        return result

    @property
    def __check_proxy(self) -> None:
        """Check whether the proxy (IP:Port) is valid or not.

        Raises:
            InvalidProxy: Raised if the proxy is not valid.
        """
        # ``fullmatch`` instead of ``search``: with ``search`` the trailing
        # ``$`` also matches just before a final newline, so "1.2.3.4:80\n"
        # would have been accepted.
        if not self._PROXY_PATTERN.fullmatch(self.proxy):
            raise InvalidProxy(self.proxy)
        return
Classes
class CMCBaseClass (proxy: Optional[str])
-
Class for basic Selenium and requests settings for cmc-py modules. Sets up a random User-Agent and a random proxy each time the class is instantiated.
Args
proxy
:Optional[str]
- Proxy to be used for Selenium and requests Session.
Expand source code
class CMCBaseClass:
    """Base class holding the Selenium and requests settings for cmc-py modules.

    Every instance picks a random User-Agent for its ``requests`` session and
    uses a proxy (the one passed in, or one fetched from a public proxy API)
    for the session and for the Selenium ``Proxy`` object it builds.
    """

    # Seconds to wait for a proxy API to answer. ``requests`` applies no
    # timeout by default, so without this an unresponsive API would block
    # forever instead of falling back / raising ProxyTimeOut.
    _PROXY_API_TIMEOUT = 10

    # IPv4 dotted quad (each octet 0-255) followed by ":" and a numeric port.
    # Compiled once at class level instead of on every validation.
    _PROXY_PATTERN = re.compile(
        r"^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]):[0-9]+$"
    )

    def __init__(self, proxy: Optional[str]):
        """
        Args:
            proxy (Optional[str]): Proxy (``IP:Port``) to be used for Selenium
                and the requests Session. When ``None``, a proxy is fetched
                from a public proxy API.

        Raises:
            ProxyTimeOut: Raised when no proxy was given and none could be
                fetched from the proxy APIs.
            InvalidProxy: Raised if the proxy is not a valid ``IP:Port`` string.
        """
        self.current_dir = os.path.dirname(os.path.realpath(__file__))
        self.parent_dir = os.path.dirname(self.current_dir)
        self.cmc_url = "https://coinmarketcap.com"
        self.__proxy_url_1 = "https://public.freeproxyapi.com/api/Proxy/ProxyByType/0/4"
        self.__proxy_url_2 = "http://pubproxy.com/api/proxy?https=true"

        # The session must exist before a proxy is fetched, because the
        # proxy lookup itself goes through this session (without a proxy).
        self.headers = CaseInsensitiveDict({"User-Agent": self.__get_random_user_agent})
        self.session = requests.Session()
        self.session.headers = self.headers

        self.proxy: str = self.__get_proxy if proxy is None else proxy
        # Property access is what triggers validation; raises InvalidProxy.
        self.__check_proxy
        self.session.proxies = {"https": self.proxy}

        # Same proxy address for every protocol of the Selenium proxy.
        self.selenium_proxy = Proxy()
        self.selenium_proxy.proxy_type = ProxyType.MANUAL
        self.selenium_proxy.http_proxy = self.proxy
        self.selenium_proxy.socks_proxy = self.proxy
        self.selenium_proxy.ssl_proxy = self.proxy

        self.driver_options = webdriver.ChromeOptions()
        # NOTE(review): this sets an attribute named ``Proxy`` (capital P);
        # Selenium's options expose ``proxy`` (lowercase) -- presumably that
        # was intended. Left unchanged to avoid altering driver behaviour;
        # confirm and fix deliberately.
        self.driver_options.Proxy = self.selenium_proxy
        self.driver_options.add_argument("headless")
        self.driver_options.add_argument("--no-sandbox")
        self.driver_options.add_argument("--log-level=3")
        self.driver_options.add_argument("ignore-certificate-errors")
        self.driver_options.add_experimental_option(
            "excludeSwitches", ["enable-logging"]
        )
        self.service = Service(ChromeDriverManager(log_level=0).install())

    @property
    def __get_proxy(self) -> str:
        """Fetch a random HTTPS proxy for using with Selenium.

        The first proxy API is tried; if the request or the parsing of its
        response fails for any reason, the second API is tried.

        Raises:
            ProxyTimeOut: Raised when a proxy cannot be fetched from either API.

        Returns:
            str: Fetched proxy from the API, as ``host:port``.
        """
        try:
            primary = self.session.get(
                self.__proxy_url_1, timeout=self._PROXY_API_TIMEOUT
            ).json()
            proxy: str = primary["host"] + ":" + str(primary["port"])
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            try:
                fallback = self.session.get(
                    self.__proxy_url_2, timeout=self._PROXY_API_TIMEOUT
                ).json()
                proxy = fallback["data"][0]["ipPort"]
            except Exception as err:
                # Keep the underlying failure attached for debugging.
                raise ProxyTimeOut from err
        # Pause after a successful fetch, as the original code did.
        time.sleep(1.5)
        return proxy

    @property
    def __get_random_user_agent(self) -> str:
        """Fetch a random User-Agent for using with requests
        Session.

        Returns:
            str: User-Agent for requests Session header.
        """
        result: str = random.choice(user_agents)
        return result

    @property
    def __check_proxy(self) -> None:
        """Check whether the proxy (IP:Port) is valid or not.

        Raises:
            InvalidProxy: Raised if the proxy is not valid.
        """
        # ``fullmatch`` instead of ``search``: with ``search`` the trailing
        # ``$`` also matches just before a final newline, so "1.2.3.4:80\n"
        # would have been accepted.
        if not self._PROXY_PATTERN.fullmatch(self.proxy):
            raise InvalidProxy(self.proxy)
        return
Subclasses