Module NSEDownload.scraper
Expand source code
import datetime
import json
import math
import pandas as pd
import re
import requests
import threading
from bs4 import BeautifulSoup
from io import StringIO
from requests.adapters import HTTPAdapter, Retry
from NSEDownload.static_data import get_headers, get_adjusted_headers, get_symbol_mapping_url, get_company_events_url, \
    get_symbol_count_url

interim_dfs = []


def process_window(stage, url):
    """Thread target: fetches one date window and stores the parsed DataFrame in interim_dfs[stage]."""
    response = make_get_request(url)
    try:
        interim_dfs[stage] = process_html_response(response)
    except AttributeError:
        # Page had no csvContentDiv (no data for this window); keep the empty placeholder DataFrame
        pass


def process_html_response(response):
    """Parses the csvContentDiv block of an NSE response into a DataFrame indexed by Date."""
    page_content = BeautifulSoup(response, "html.parser")
    lines = page_content.find(id="csvContentDiv").get_text()
    lines = lines.replace(':', ", \n")
    df = pd.read_csv(StringIO(lines))
    df.set_index("Date", inplace=True)
    df = df[::-1]
    return df


def make_get_request(url):
    """Fetches url over a session that retries transient 5xx errors and returns the raw response body."""
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))
    response = session.get(url, timeout=10, headers=get_headers())
    session.close()
    return response.content


def scrape_data(start_date, end_date, request_type,
                index_name=None, url=None, stock_symbol=None, symbol_count=None, series="EQ"):
    """Called by the stocks and indices modules to scrape data. Creates one thread per one-year window,
    parses each response, combines the results and returns a DataFrame.

    Args:
        start_date (datetime): start date
        end_date (datetime): end date
        request_type (str): Either 'stock' or 'index'
        index_name (str, optional): Name of the index if request_type is 'index'. Defaults to None.
        url (str, optional): URL to scrape from. Defaults to None.
        stock_symbol (str, optional): Stock symbol if request_type is 'stock'. Defaults to None.
        symbol_count (str, optional): Intermediate variable needed for scraping. Defaults to None.
        series (str, optional): Defaults to "EQ", but any series or "All" can be chosen.

    Returns:
        Pandas DataFrame: df containing data for the symbol over the provided date range
    """
    total_stages = math.ceil((end_date - start_date).days / 365)

    global interim_dfs
    interim_dfs = [pd.DataFrame()] * total_stages

    threads = []
    for stage in range(total_stages):

        window_start_date = start_date + stage * datetime.timedelta(days=365)
        window_end_date = window_start_date + datetime.timedelta(days=364)

        if window_start_date > end_date:
            break

        if window_end_date > end_date:
            window_end_date = end_date

        if request_type == 'stock':
            final_url = get_symbol_mapping_url() + '?symbol=' + stock_symbol + '&segmentLink=3&symbolCount=' \
                + symbol_count + "&series=" + series + "&dateRange=+&fromDate=" + \
                window_start_date.strftime("%d-%m-%Y") + "&toDate=" + \
                window_end_date.strftime("%d-%m-%Y") + "&dataType=PRICEVOLUMEDELIVERABLE"

        if request_type == 'index':
            final_url = url + '?indexType=' + index_name + \
                '&fromDate=' + window_start_date.strftime("%d-%m-%Y") + \
                '&toDate=' + window_end_date.strftime("%d-%m-%Y")

        thread = threading.Thread(target=process_window, args=[stage, final_url])
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    result = pd.DataFrame()
    for stage in range(total_stages):
        result = pd.concat([result, interim_dfs[stage]])

    result.index = pd.to_datetime(result.index)
    result.sort_index(inplace=True)

    return result


def add_quotes_to_field(match):
    """re.sub callback: wraps the matched bare key in double quotes so the text becomes valid JSON."""
    match = match.group()
    return match[0] + '"' + match[1:-1] + '":'


def scrape_bonus_splits(symbol):
    """Scrapes for bonuses and splits

    Args:
        symbol (str): Stock Symbol

    Returns:
        list: the ratio of original and new price for each event, and the corresponding event dates
    """
    event_dates, event_ratio = [], []

    url_more_than_24 = get_company_events_url() + symbol + \
        "&Industry=&ExDt=More%20than%2024%20" \
        "Months&exDt=More%20than%2024%20Months" \
        "&recordDt=&bcstartDt=&industry" \
        "=&CAType="

    url_last_24 = get_company_events_url() + symbol + \
        "&Industry=&ExDt=Last%2012%20Months" \
        "&exDt=Last%2012%20Months&" \
        "&recordDt=&bcstartDt=&industry" \
        "=&CAType="

    for url in [url_more_than_24, url_last_24]:

        response = requests.get(url, timeout=60, headers=get_adjusted_headers())
        page_content = "{" + BeautifulSoup(response.content, "html.parser").text.replace('\n', '').replace('\t', '')[16:]
        json_input = re.sub(r'[{,][a-zA-Z]+:', add_quotes_to_field, page_content)
        json_content = json.loads(json_input)
        corporate_actions = json_content["rows"]

        for row in corporate_actions:

            subject = row["sub"].lower()
            date = row["exDt"]

            if date not in event_dates:

                # Scraping for Splits
                if subject.find("split") != -1 or subject.find("division") != -1:
                    num = re.findall(r'\d+', subject)
                    if len(num) < 2:
                        continue
                    event_ratio.append(int(num[0]) / int(num[1]))
                    event_dates.append(date)
                    print("Split event on: " + date)

                # Scraping for Bonus
                if subject.find("bonus") != -1:
                    num = re.findall(r'\d+', subject)
                    if len(num) < 2:
                        continue
                    event_ratio.append((int(num[0]) + int(num[1])) / int(num[1]))
                    event_dates.append(date)
                    print("Bonus event on: " + date)

    return [event_ratio, event_dates]


def scrape_symbol(symbol):
    """Scrapes the intermediate symbolCount variable for a stock symbol

    Args:
        symbol (str): Stock Symbol

    Raises:
        SystemExit: Raised on any exception from the request

    Returns:
        str: Symbol Count
    """
    try:
        response = requests.post(get_symbol_count_url(),
                                 data={"symbol": symbol},
                                 headers=get_headers(),
                                 timeout=60)
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)

    if response.status_code != requests.codes.ok:
        response.raise_for_status()

    return str(BeautifulSoup(response.content, "html.parser"))
Functions
def add_quotes_to_field(match)
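Regex-substitution callback used by scrape_bonus_splits to wrap bare JSON keys in double quotes. A minimal illustration of that substitution; the payload below is a made-up stand-in for NSE's unquoted-key response, not captured output:

import json
import re
from NSEDownload.scraper import add_quotes_to_field

raw = '{rows:[{sub:"BONUS 1:1",exDt:"01-JAN-2020"}]}'  # illustrative, unquoted keys
fixed = re.sub(r'[{,][a-zA-Z]+:', add_quotes_to_field, raw)
print(fixed)                                # {"rows":[{"sub":"BONUS 1:1","exDt":"01-JAN-2020"}]}
print(json.loads(fixed)["rows"][0]["exDt"])  # 01-JAN-2020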
def make_get_request(url)
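Fetches a URL over a retrying HTTPS session and returns the raw response body. A minimal sketch of a direct call; the URL below is a placeholder, not an NSE endpoint:

from NSEDownload.scraper import make_get_request

# Placeholder URL for illustration; scrape_data builds the real NSE report URLs
body = make_get_request("https://example.com/")
print(body[:60])  # raw response bytes, later handed to BeautifulSoup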
def process_html_response(response)
def process_window(stage, url)
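process_window is the worker that scrape_data runs on each thread; it writes the parsed DataFrame for its window into the module-level interim_dfs list. A minimal sketch of that pattern, using placeholder URLs rather than real NSE endpoints (so the results stay empty):

import threading
import pandas as pd
from NSEDownload import scraper

# scrape_data sizes the shared list first, then starts one thread per window
scraper.interim_dfs = [pd.DataFrame()] * 2
urls = ["https://example.com/window-0", "https://example.com/window-1"]  # placeholders

threads = [threading.Thread(target=scraper.process_window, args=[stage, url])
           for stage, url in enumerate(urls)]
for t in threads:
    t.start()
for t in threads:
    t.join()

combined = pd.concat(scraper.interim_dfs)  # empty here, since the placeholder pages have no data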
def scrape_bonus_splits(symbol)
Scrapes for bonuses and splits

Args
    symbol (str): Stock Symbol

Returns
    list: the ratio of original and new price for each event, and the corresponding event dates
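A hedged usage sketch; the symbol is illustrative, and the call only succeeds while NSE's corporate-actions pages are reachable in the format this scraper expects:

from NSEDownload.scraper import scrape_bonus_splits

ratios, dates = scrape_bonus_splits("SBIN")  # returns [event_ratio, event_dates]
for date, ratio in zip(dates, ratios):
    print(date, ratio)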
def scrape_data(start_date, end_date, request_type, index_name=None, url=None, stock_symbol=None, symbol_count=None, series='EQ')
Called by the stocks and indices modules to scrape data. Creates one thread per one-year window, parses each response, combines the results and returns a DataFrame.

Args
    start_date (datetime): start date
    end_date (datetime): end date
    request_type (str): Either 'stock' or 'index'
    index_name (str, optional): Name of the index if request_type is 'index'. Defaults to None.
    url (str, optional): URL to scrape from. Defaults to None.
    stock_symbol (str, optional): Stock symbol if request_type is 'stock'. Defaults to None.
    symbol_count (str, optional): Intermediate variable needed for scraping. Defaults to None.
    series (str, optional): Defaults to "EQ", but any series or "All" can be chosen.

Returns
    Pandas DataFrame: df containing data for the symbol over the provided date range
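A hedged usage sketch for the stock path (the symbol and dates are illustrative; the call depends on NSE still serving the old price-volume-deliverable pages):

import datetime
from NSEDownload import scraper

symbol = "SBIN"                               # example symbol
symbol_count = scraper.scrape_symbol(symbol)  # intermediate value the NSE URL expects

df = scraper.scrape_data(
    start_date=datetime.datetime(2019, 1, 1),
    end_date=datetime.datetime(2020, 12, 31),
    request_type='stock',
    stock_symbol=symbol,
    symbol_count=symbol_count,
    series="EQ",
)
print(df.head())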
def scrape_symbol(symbol)
Scrapes the intermediate symbolCount variable for a stock symbol

Args
    symbol (str): Stock Symbol

Raises
    SystemExit: Raised on any exception from the request

Returns
    str: Symbol Count
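A minimal usage sketch; the symbol is illustrative:

from NSEDownload.scraper import scrape_symbol

count = scrape_symbol("SBIN")
print(count)  # string passed as the symbolCount query parameter when building price-volume URLs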