# all the imports

from chat_downloader import ChatDownloader
from api_keys import *
import numpy as np
import math
import pandas as pd
import requests
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime
from dateutil import parser
import seaborn as sns
from scipy import stats


# Exchange rates, expressed as units of each currency per 1 USD, taken from the currency API on 4/27/2021:
# https://v6.exchangerate-api.com/v6/<API_KEY>/latest/USD
# LIMITATION: every amount is converted with the rates from this single date, which is not exact
# because conversion rates fluctuate.
# Historical rates would fix this, but the libraries that offer them do not support most of the
# currencies that Youtube allows. The rates do not change very much, so this is treated as acceptable.
CONVERSION_RATES = {"USD":1,"AED":3.6725,"AFN":78.1250,"ALL":101.8953,"AMD":520.4600,"ANG":1.7900,"AOA":654.5061,"ARS":93.1530,"AUD":1.2842,"AWG":1.7900,"AZN":1.7004,"BAM":1.6173,"BBD":2.0000,"BDT":84.8896,"BGN":1.6176,"BHD":0.3760,"BIF":1949.0714,"BMD":1.0000,"BND":1.3250,"BOB":6.8734,"BRL":5.4645,"BSD":1.0000,"BTN":74.8250,"BWP":10.8219,"BYN":2.5626,"BZD":2.0000,"CAD":1.2406,"CDF":1980.3907,"CHF":0.9149,"CLP":713.3027,"CNY":6.4838,"COP":3623.1336,"CRC":613.3445,"CUC":1.0000,"CUP":25.7500,"CVE":91.1784,"CZK":21.4154,"DJF":177.7210,"DKK":6.1690,"DOP":56.9402,"DZD":133.1381,"EGP":15.6777,"ERN":15.0000,"ETB":41.9764,"EUR":0.8269,"FJD":2.0310,"FKP":0.7190,"FOK":6.1690,"GBP":0.7190,"GEL":3.4495,"GGP":0.7190,"GHS":5.7681,"GIP":0.7190,"GMD":51.9242,"GNF":9948.7342,"GTQ":7.7035,"GYD":213.3299,"HKD":7.7609,"HNL":23.9751,"HRK":6.2303,"HTG":85.2293,"HUF":301.6614,"IDR":14461.3437,"ILS":3.2543,"IMP":0.7190,"INR":74.8254,"IQD":1457.8722,"IRR":41971.7970,"ISK":124.9323,"JMD":152.3990,"JOD":0.7090,"JPY":108.0579,"KES":108.1244,"KGS":84.7655,"KHR":4045.3231,"KID":1.2842,"KMF":406.8094,"KRW":1112.4162,"KWD":0.2996,"KYD":0.8333,"KZT":430.1103,"LAK":9416.1959,"LBP":1507.5000,"LKR":193.5438,"LRD":172.5626,"LSL":14.2630,"LYD":4.4773,"MAD":8.8904,"MDL":17.9561,"MGA":3773.8878,"MKD":51.0540,"MMK":1410.4465,"MNT":2847.8345,"MOP":7.9937,"MRU":35.9286,"MUR":40.3427,"MVR":15.3019,"MWK":790.3944,"MXN":19.8544,"MYR":4.0993,"MZN":55.9805,"NAD":14.2630,"NGN":397.9828,"NIO":35.1494,"NOK":8.2981,"NPR":119.7199,"NZD":1.3846,"OMR":0.3845,"PAB":1.0000,"PEN":3.7996,"PGK":3.5027,"PHP":48.3724,"PKR":153.7474,"PLN":3.7758,"PYG":6466.9532,"QAR":3.6400,"RON":4.0718,"RSD":97.2763,"RUB":74.8769,"RWF":999.0229,"SAR":3.7500,"SBD":7.9734,"SCR":13.9238,"SDG":379.0468,"SEK":8.3875,"SGD":1.3250,"SHP":0.7190,"SLL":10216.8942,"SOS":578.0796,"SRD":14.1435,"SSP":177.5625,"STN":20.2591,"SYP":1263.4391,"SZL":14.2630,"THB":31.4548,"TJS":11.3094,"TMT":3.5001,"TND":2.7331,"TOP":2.2623,"TRY":8.3072,"TTD":6.7912,"TVD":1.2842
,"TWD":27.8733,"TZS":2314.9588,"UAH":27.8700,"UGX":3599.2768,"UYU":44.0618,"UZS":10621.6327,"VES":2721326.1200,"VND":23041.9420,"VUV":108.5327,"WST":2.5109,"XAF":542.4126,"XCD":2.7000,"XDR":0.6957,"XOF":542.4126,"XPF":98.6759,"YER":250.5638,"ZAR":14.2627,"ZMW":22.2229}
# Alias so that amounts tagged with the peso symbol use the PHP rate
# (presumably some messages report "₱" instead of the ISO code — verify against chat_downloader output).
CONVERSION_RATES["₱"] = CONVERSION_RATES["PHP"]
# Flat USD amount credited for every 'membership_item' message in get_superchats.
MEMBERSHIP_VALUE = 4.99

def get_superchats(url):
    '''string url -> tuple (# of superchats, $ value of superchats, superchat_timestamps)

    Iterates over the 'superchat' message group of the chat at `url` and totals it in USD.
    Paid messages and paid stickers are converted with CONVERSION_RATES; each
    membership item is credited a flat MEMBERSHIP_VALUE.

    Only recognised message types are counted, so the returned count always equals
    len(superchat_timestamps). Unknown message types are printed and ignored.

    Raises KeyError if a paid message uses a currency missing from CONVERSION_RATES.
    '''
    chat = ChatDownloader().get_chat(url, message_groups=['superchat'])  # create a generator
    total_superchat_earnings_usd = 0.0
    timestamps = []
    for message in chat:  # iterate over messages
        message_type = message['message_type']
        if message_type in ('paid_message', 'paid_sticker'):
            # superchat: convert the paid amount to USD
            money = message['money']
            total_superchat_earnings_usd += money['amount'] / CONVERSION_RATES[money['currency']]
        elif message_type == 'membership_item':
            total_superchat_earnings_usd += MEMBERSHIP_VALUE
        else:
            print("unknown message type: ", message_type)
            print(message)
            # do not count messages that contributed neither money nor a timestamp
            continue
        timestamps.append(message["timestamp"])
    return (len(timestamps), total_superchat_earnings_usd, timestamps)

def get_last_50_videos(channel_id):
    """Return the playlistItems JSON (maxResults=50) for a channel's uploads playlist.

    Makes two YouTube Data API requests: one to look up the id of the channel's
    "uploads" playlist, one to list that playlist's items.
    """
    channel_lookup_url = (
        "https://youtube.googleapis.com/youtube/v3/channels?part=contentDetails&id={0}&key={1}"
    ).format(channel_id, YOUTUBE_API_KEY)
    channel_info = requests.get(channel_lookup_url).json()

    # the "uploads" playlist holds the channel's videos
    uploads_playlist_id = channel_info["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]

    playlist_items_url = (
        "https://www.googleapis.com/youtube/v3/playlistItems?part=contentDetails&maxResults=50&playlistId={0}&key={1}"
    ).format(uploads_playlist_id, YOUTUBE_API_KEY)
    return requests.get(playlist_items_url).json()

def get_video_details(vid_id):
    """Return the YouTube Data API `videos` JSON response for a single video id.

    Requests the liveStreamingDetails, statistics, status, topicDetails,
    localizations, snippet and contentDetails parts.
    """
    request_url = (
        "https://youtube.googleapis.com/youtube/v3/videos?part=liveStreamingDetails%2Cstatistics%2Cstatus%2CtopicDetails%2Clocalizations%2Csnippet%2CcontentDetails&id={0}&key={1}"
    ).format(vid_id, YOUTUBE_API_KEY)
    return requests.get(request_url).json()

def get_all_vids_details(channel_videos):
    """Fetch the details of every video listed in a playlistItems response.

    `channel_videos` is the dict returned by get_last_50_videos; the result is a
    list of get_video_details responses in the same order as its "items".
    """
    return [
        get_video_details(entry["contentDetails"]["videoId"])
        for entry in channel_videos["items"]
    ]


# dataframe will have columns:
# channel name | channel id | video name | video id | description | publishedAt | video start time | video end time |
# video length | # superchats | $ superchats | localization | viewcount | tags (topicdetails) | timestamps of each superchat


# Channel list; the three column names are supplied here rather than read from the file.
c_names = pd.read_csv(
    "vtuber_channels.csv",
    names=["vtuber_name", "affiliation", "channel_id"],
)

# Empty result frame: one row per processed video is appended by the scraping loop.
df = pd.DataFrame(
    columns=[
        'channel_name', 'channel_id', 'video_name', 'video_id', 'description',
        'published_at', 'video_start_time', 'video_end_time', 'video_length',
        'num_superchats', 'val_superchats', 'locale', 'viewcount', 'tags',
        'timestamps',
    ]
)

c_names.head()


# Scrape every listed channel: fetch its videos, total the superchats of each one and
# append one row per video to `df`. Videos that fail for any reason are skipped.
videos_processed = 0
for index, row in c_names.iterrows():
    # this if statement was used in parallelization, so that each process would be operating on different vtubers.
    # NOTE(review): the cutoff is hard-coded; this process skips every row with index <= 40.
    if index <= 40:
        continue
    print(row['vtuber_name'], row['channel_id'])

    # get their videos
    last_50_vids = get_last_50_videos(row['channel_id'])
    all_vids_details = get_all_vids_details(last_50_vids)

    for vid in all_vids_details:
        try:
            # Looked up inside the try: a response with a missing or empty "items" list
            # now skips only that video instead of aborting the whole run.
            item = vid["items"][0]
            data_we_want = [row['vtuber_name'], row['channel_id'],
                            item['snippet']['title'],
                            item['id'],
                            item['snippet']['description'],
                            item['snippet']['publishedAt'],
                            # KeyError here (caught below) drops videos without live-stream details
                            item['liveStreamingDetails']['actualStartTime'],
                            item['liveStreamingDetails']['actualEndTime'],
                            0, # video length, to be computed later using subtraction, its also ['contentDetails']['duration']
                            0, # num_superchats, to be computed next
                            0.0, # val_superchats, to be computed next
                            'ja', # language (default used when the API reports none)
                            item['statistics']['viewCount'],
                            [], # topic
                            [] # superchat timestamps - to be computed next
                           ]

            if 'defaultAudioLanguage' in item['snippet']:
                data_we_want[11] = item['snippet']['defaultAudioLanguage']

            if 'topicDetails' in item and 'topicCategories' in item['topicDetails']:
                # keep only the article name of each Wikipedia topic URL
                data_we_want[13] = [topic.replace('https://en.wikipedia.org/wiki/', '')
                                    for topic in item['topicDetails']['topicCategories']]

            yt_url = 'https://www.youtube.com/watch?v=' + item['id']
            total_num_chats, total_superchat_earnings_usd, timestamps = get_superchats(yt_url)
            data_we_want[9] = total_num_chats
            data_we_want[10] = total_superchat_earnings_usd
            data_we_want[14] = timestamps
            df.loc[len(df)] = data_we_want
            videos_processed += 1
            print(videos_processed, data_we_want)

        except Exception as e:
            # best effort: report the problem and move on to the next video
            print(e)
            continue

    # checkpoint everything collected so far after each channel
    df.to_hdf('./data_h5s/data_'+str(index)+'.h5', key='df', mode='w')


# Load the five checkpoint files written by the scraping processes.
df1, df2, df3, df4, df5 = (
    pd.read_hdf("./data_h5s/data_{0}.h5".format(shard))
    for shard in (10, 20, 30, 40, 49)
)
df1.head()


c_names = pd.read_hdf("./h5s/vtubers.h5")

# Order channels by subscriber count (ascending) and renumber the index, so that
# a channel's index is its rank by subscribers.
c_names = c_names.assign(
    subs_count=pd.to_numeric(c_names["subs_count"])
).sort_values(by=['subs_count'], ignore_index=True)
c_names.head()


import json  # needed by json.loads below; imported here like the file's other mid-script imports

# NOTE(review): BeautifulSoup is not imported anywhere in the visible part of this file;
# it must already be in scope (presumably `from bs4 import BeautifulSoup`) for this cell to run.

# Add a "game_name" column to every shard. For videos whose tags mention "game", the
# name is read from the ytInitialData JSON embedded in the watch page; all other
# videos (and any video where the lookup fails) get 'N/A'.
YT_INITIAL_DATA_MARKER = 'var ytInitialData = '

for cur_df in [df1, df2, df3, df4, df5]:

    vals_to_add = []
    for index, row in cur_df.iterrows():
        vid = row["video_id"]
        game_name = 'N/A'
        if "game" in "".join(row["tags"]):
            # the row is a game
            z = requests.get('https://www.youtube.com/watch?v={0}'.format(vid)).text
            soup = BeautifulSoup(z, 'html.parser')
            for script in soup.findAll('script'):
                if YT_INITIAL_DATA_MARKER not in str(script):
                    continue
                try:
                    # Take the text after the marker. str.lstrip() would strip a *set of
                    # characters*, not the prefix, and only worked by accident.
                    payload = script.contents[0].partition(YT_INITIAL_DATA_MARKER)[2].rstrip(';')
                    json_data = json.loads(payload)
                    # get the relevant field from the json
                    game_name = json_data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][1]["videoSecondaryInfoRenderer"]["metadataRowContainer"]["metadataRowContainerRenderer"]["rows"][0]["richMetadataRowRenderer"]["contents"][0]["richMetadataRenderer"]["title"]["simpleText"]
                except (KeyError, IndexError, TypeError, ValueError):
                    # malformed JSON or a page layout without the game row: try the next script tag
                    continue
                print(game_name)
                break
        vals_to_add.append(game_name)
    cur_df["game_name"] = vals_to_add


# Stack the shards into one frame, persist it, and reload it from disk.
shards = [df1, df2, df3, df4, df5]
joined = pd.concat(shards, ignore_index=True)
joined.to_hdf('h5s/joined.h5', key='df', mode='w')

joined = pd.read_hdf('h5s/joined.h5')
joined.head()


# Superchat timestamps are divided by 1,000,000 (i.e. treated as microseconds since
# the epoch) and converted to naive UTC datetimes.
joined['timestamps'] = joined['timestamps'].apply(
    lambda raw: [datetime.utcfromtimestamp(micros/1000000) for micros in raw]
)
joined.head()

# The three API time columns are ISO-8601 strings; parse each into a datetime.
for time_col in ('published_at', "video_start_time", "video_end_time"):
    joined[time_col] = joined[time_col].apply(parser.isoparse)

joined.head()


# Stream length in minutes (end time minus start time).
joined["video_length"] = [
    (ended - started)/np.timedelta64(1,'m')
    for started, ended in zip(joined["video_start_time"], joined["video_end_time"])
]

for numeric_col in ("viewcount", "video_length", "num_superchats"):
    joined[numeric_col] = pd.to_numeric(joined[numeric_col])

# Hour of day at which the stream started / ended.
joined["stream_start_hour"] = joined["video_start_time"].apply(lambda ts: ts.hour)
joined["stream_end_hour"] = joined["video_end_time"].apply(lambda ts: ts.hour)
# 1 when any tag contains the substring 'game', else 0.
joined["is_gaming"] = joined["tags"].apply(lambda tags: int('game' in ''.join(tags)))


def _average_superchat(record):
    # mean value per superchat; 0 when the video has no superchats
    if record["num_superchats"] > 0:
        return record["val_superchats"]/record["num_superchats"]
    return 0


joined["average_superchat_value"] = joined.apply(_average_superchat, axis=1)


def vtuber_to_ordinal(vt):
    """Return the index label of the first c_names row whose vtuber_name equals `vt`.

    Raises IndexError when no row matches.
    """
    is_match = (c_names['vtuber_name'] == vt).to_numpy()
    return c_names.index[is_match][0]

def affiliation_to_ordinal(vt):
    """Return the position of vtuber `vt`'s affiliation in a fixed ordering of agencies.

    The affiliation is read from the first c_names row whose vtuber_name equals `vt`.
    Raises IndexError when no row matches and ValueError when the affiliation is
    not one of the known values.
    """
    ranking = ("independent", "voms", "holostars", "nijisanji", "hololive_id", "hololive_en", "hololive")
    first_match = c_names.loc[c_names['vtuber_name'] == vt].iloc[0]
    return ranking.index(first_match.affiliation)

# Encode each video's channel as two ordinal features derived from c_names.
channel_names = joined["channel_name"]
joined["vtuber_ordinal"] = channel_names.apply(vtuber_to_ordinal)
joined["affiliation_ordinal"] = channel_names.apply(affiliation_to_ordinal)

joined.head()


# Correlation heatmaps and scatter matrix of the numeric features.
# DataFrame.corr() no longer drops non-numeric columns silently (pandas >= 2.0 raises
# on strings/lists), so select the numeric columns explicitly; this works on old and new pandas.
joined_numeric = joined.select_dtypes(include="number")

# Pearson correlation
plt.figure(figsize=(9,6))
sns.heatmap(joined_numeric.corr(), annot=True, cmap='Blues')
plt.show()


# Spearman (rank) correlation
plt.figure(figsize=(9,6))
sns.heatmap(joined_numeric.corr("spearman"), annot=True, cmap='Blues')
plt.show()


axes = scatter_matrix(joined, figsize=(12, 10))
for ax in axes.flatten():
    # tilt the y labels so that long column names do not overlap
    ax.yaxis.label.set_rotation(60)
    ax.yaxis.label.set_ha('right')
plt.tight_layout()
plt.show()


# Channels treated as the English-language group in the analysis below.
ENGLISH_VTUBERS = ["gawr_gura",
"takanashi_kiara",
"mori_calliope",
"ninomae_inanis",
"watson_amelia"
]

english_vtubers = joined.loc[joined['channel_name'].isin(ENGLISH_VTUBERS)]


# Pearson correlation heatmap restricted to the English group.
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0.
plt.figure(figsize=(9,6))
sns.heatmap(english_vtubers.select_dtypes(include="number").corr(), annot=True, cmap='Blues')
plt.show()


from datetime import time,date, timedelta
# Time-of-day chart for the non-EN channels: a line with the number of streams live
# at each minute of the day, overlaid with bars of superchats per hour of the day.
datetimes = {}   # minute of day (on a fake date) -> number of streams live
schats = {}      # hour of day -> number of superchats
# 00:00 to 23:59

ENGLISH_VTUBERS = ["gawr_gura",
"takanashi_kiara",
"mori_calliope",
"ninomae_inanis",
"watson_amelia"
]

# this is a probably inefficient way of finding times when streams were live
# pick all non-english vtubers (note the ~)
for ind, row in joined.loc[~joined['channel_name'].isin(ENGLISH_VTUBERS)].iterrows():

    ct = row["video_start_time"]
    while ct <= row["video_end_time"]:
        # use a random fake date because matplotlib needs an actual date to plot time series
        t = datetime.combine(date(2000, 1, 1),
                          ct.replace(second=0).time())
        datetimes[t] = datetimes.get(t, 0) + 1
        ct += timedelta(minutes=1)

    # Bucket each superchat by the hour of its OWN timestamp. The previous code used
    # ct.hour, which after the loop above is just past the stream's end time, so every
    # superchat of a stream landed in the same bucket.
    # NOTE(review): the timestamps are naive UTC datetimes; this assumes the start/end
    # times are UTC as well so both series share one clock — confirm.
    for i in row["timestamps"]:
        t1 = i.hour
        schats[t1] = schats.get(t1, 0) + 1
#print(datetimes)
print(schats)
x,y = zip(*sorted(datetimes.items()))
x2,y2 = zip(*sorted(schats.items()))
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(x,y)

# second pair of axes so that the hourly bars get their own x and y scales
ax2 = ax.twiny().twinx()
ax2.set_xticks([])
ax2.tick_params('y', colors='b')
#ax2.set_ylim(top=8000)
ax2.bar(x2, y2, color='orange', alpha=0.5)

xfmt = mdates.DateFormatter('%H:%M')
ax.set_title("Streams and Superchats - non-EN")
ax.set_xlabel("Time of day")
ax.set_ylabel("Number of streams live")
ax2.set_ylabel("Number of superchats in hour")
ax.xaxis.set_major_locator(mdates.HourLocator())
ax.xaxis.set_major_formatter(xfmt)

{15: 30799, 11: 10680, 2: 2095, 14: 27858, 10: 7658, 13: 25165, 17: 15549, 3: 3815, 6: 4598, 5: 4323, 12: 28380, 9: 6158, 8: 3170, 16: 26075, 19: 8146, 7: 5199, 1: 1084, 0: 2550, 20: 4475, 18: 10434, 4: 2431, 21: 4563, 23: 3492, 22: 2915}


from datetime import time,date, timedelta
# Time-of-day chart for the EN channels: a line with the number of streams live
# at each minute of the day, overlaid with bars of superchats per hour of the day.
datetimes = {}   # minute of day (on a fake date) -> number of streams live
schats = {}      # hour of day -> number of superchats
# 00:00 to 23:59

ENGLISH_VTUBERS = ["gawr_gura",
"takanashi_kiara",
"mori_calliope",
"ninomae_inanis",
"watson_amelia"
]

# this is a probably inefficient way of finding times when streams were live
# pick only these 5 vtubers
for ind, row in joined.loc[joined['channel_name'].isin(ENGLISH_VTUBERS)].iterrows():

    ct = row["video_start_time"]
    while ct <= row["video_end_time"]:
        # use a random fake date because matplotlib needs an actual date to plot time series
        t = datetime.combine(date(2000, 1, 1),
                          ct.replace(second=0).time())
        datetimes[t] = datetimes.get(t, 0) + 1
        ct += timedelta(minutes=1)

    # Bucket each superchat by the hour of its OWN timestamp. The previous code used
    # ct.hour, which after the loop above is just past the stream's end time, so every
    # superchat of a stream landed in the same bucket.
    # NOTE(review): the timestamps are naive UTC datetimes; this assumes the start/end
    # times are UTC as well so both series share one clock — confirm.
    for i in row["timestamps"]:
        t1 = i.hour
        schats[t1] = schats.get(t1, 0) + 1
#print(datetimes)
print(schats)
x,y = zip(*sorted(datetimes.items()))
x2,y2 = zip(*sorted(schats.items()))
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(x,y)

# second pair of axes so that the hourly bars get their own x and y scales
ax2 = ax.twiny().twinx()
ax2.set_xticks([])
ax2.tick_params('y', colors='b')
#ax2.set_ylim(top=8000)
ax2.bar(x2, y2, color='orange', alpha=0.5)

xfmt = mdates.DateFormatter('%H:%M')
ax.set_title("Streams and Superchats - EN")
ax.set_xlabel("Time of day")
ax.set_ylabel("Number of streams live")
ax2.set_ylabel("Number of superchats in hour")
ax.xaxis.set_major_locator(mdates.HourLocator())
ax.xaxis.set_major_formatter(xfmt)

{5: 6097, 1: 2661, 4: 4379, 0: 2704, 3: 3950, 6: 4434, 7: 1540, 2: 2065, 14: 554, 22: 2267, 19: 1934, 15: 2457, 21: 1442, 20: 1674, 23: 3306, 18: 620, 16: 3725, 13: 1025, 17: 1095, 8: 338, 12: 100}


# Scatter of superchat earnings against view count, both axes linear.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(joined["viewcount"], joined["val_superchats"], 'o', alpha=0.5)
ax.set(
    xlabel="View Count",
    ylabel="Superchat Earnings (USD)",
    title="View Count vs Superchat Earnings (linear scale)",
)

Text(0.5, 1.0, 'View Count vs Superchat Earnings (linear scale)')


# Same scatter as before, drawn with logarithmic x and y axes.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(joined["viewcount"], joined["val_superchats"], 'o', alpha=0.5)
for set_scale in (ax.set_yscale, ax.set_xscale):
    set_scale('log')

ax.set(
    xlabel="View Count",
    ylabel="Superchat Earnings (USD)",
    title="View Count vs Superchat Earnings (log-log scale)",
)

Text(0.5, 1.0, 'View Count vs Superchat Earnings (log-log scale)')


# Ordinary least-squares fit of earnings on view count (untransformed data).
# scipy's linregress is used because it also reports the associated statistics.
slope, intercept, r_value, p_value, std_err = stats.linregress(
    joined["viewcount"], joined["val_superchats"]
)
print(f'Regression from scipy y=mx+b: \ny = {slope:f} x + {intercept:f}')
print(f'p-value: {p_value:e}')
print(f'r-value: {r_value:f}')
print(f'standard error: {std_err:f}')

Regression from scipy y=mx+b: 
y = 0.005616 x + 310.870274
p-value: 2.826482e-87
r-value: 0.402350
standard error: 0.000271


# Draw the straight-line fit (slope, intercept) over the log-log scatter.
fig, ax = plt.subplots(figsize=(15, 10))
plt.xlim([10**3, 5*10**6])
ax.plot(joined["viewcount"], joined["val_superchats"], 'o', alpha=0.5)
ax.set_yscale('log')
ax.set_xscale('log')

ax.set(
    xlabel="View Count",
    ylabel="Superchat Earnings (USD)",
    title="View Count vs Superchat Earnings (log-log scale)",
)
# view counts at which the fitted line is evaluated
x = np.arange(10**3, 10**7, 5)
fitted = slope*x + intercept
ax.plot(x, fitted)

plt.show()


# Fit a line to log(earnings) against log(view count).
# Videos without earnings are dropped first, since log(0) is undefined.
temp_df = joined.loc[joined['val_superchats'] > 0, ["viewcount", "val_superchats"]]
temp_df = temp_df.assign(
    viewcount=temp_df['viewcount'].map(math.log),
    val_superchats=temp_df['val_superchats'].map(math.log),
)

slope, intercept, r_value, p_value, std_err = stats.linregress(
    temp_df["viewcount"], temp_df["val_superchats"]
)
print(f'Regression from scipy y=mx+b: \ny = {slope:f} x + {intercept:f}')
print(f'p-value: {p_value:e}')
print(f'r-value: {r_value:f}')
print(f'r^2: {r_value**2:f}')
print(f'standard error: {std_err:f}')

Regression from scipy y=mx+b: 
y = 0.758142 x + -2.713375
p-value: 1.969003e-138
r-value: 0.502581
r^2: 0.252588
standard error: 0.028093


# Overlay the log-log fit on the original data as a power law y = a * x**k,
# where a = e**intercept and k = slope.
fig, ax = plt.subplots(figsize=(15, 10))
plt.xlim([10**3, 5*10**6])
ax.plot(joined["viewcount"], joined["val_superchats"], 'o', alpha=0.5)
ax.set_yscale('log')
ax.set_xscale('log')

ax.set(
    xlabel="View Count",
    ylabel="Superchat Earnings (USD)",
    title="View Count vs Superchat Earnings (log-log scale)",
)
x = np.arange(10**3, 10**7, 5)
# convert the slope and intercept to the relevant values in the right places
power_law_scale = math.e**intercept
print("a = ", power_law_scale)
print("k = ", slope)
ax.plot(x, power_law_scale*x**slope)

plt.show()

a =  0.06631264243217902
k =  0.7581418841466804


# Plot the already log-transformed data (temp_df) on linear axes together with
# the straight line fitted to it.
fig, ax = plt.subplots(figsize=(15, 10))
plt.xlim([7, 15])
ax.plot(temp_df["viewcount"], temp_df["val_superchats"], 'o', alpha=0.5)

ax.set(
    xlabel="View Count",
    ylabel="Superchat Earnings (USD)",
    title="View Count vs Superchat Earnings (linear scale, after taking log of both vars)",
)
# 7.0, 7.1, ..., 15.9
x = np.arange(70, 160)/10.0
# convert the slope and intercept to the relevant values in the right places
ax.plot(x, slope*x+intercept)

plt.show()


# Residuals of the log-log fit: observed log-earnings minus the fitted line.
print(len(temp_df))
fig, ax = plt.subplots(figsize=(15, 10))
residuals = (
    temp_df["val_superchats"] - slope*temp_df["viewcount"] - intercept
).tolist()
#print(residuals)
ax.plot(temp_df["viewcount"], residuals, 'o', alpha=0.5)
#plt.yscale('symlog')
ax.set(
    xlabel="View Count",
    ylabel="Superchat Earnings (USD)",
    title="Residuals (log-log)",
)
# horizontal reference line at y = 0
ax.axhline(color="gray")
plt.show()

2157


# Round log view counts to the nearest multiple of 2 and show the residual
# distribution of each resulting bin.
vc2 = temp_df["viewcount"].map(lambda log_views: 2*round(log_views/2, 0))

sns.violinplot(x=vc2, y=residuals)

<AxesSubplot:xlabel='viewcount'>


# Residuals of the power-law fit in the original (USD) scale, for every video.
fig, ax = plt.subplots(figsize=(15, 10))
residuals = [
    earned - (math.e**intercept)*views**slope
    for views, earned in zip(joined["viewcount"], joined["val_superchats"])
]
#print(residuals)
ax.plot(joined["viewcount"], residuals, 'o', alpha=0.5)
#plt.yscale('symlog')
ax.set_xscale('log')
ax.set(
    xlabel="View Count",
    ylabel="Superchat Earnings (USD)",
    title="Residuals",
)

plt.show()


# Repeat the log-log regression using only videos with a known game name.
games_only = joined.loc[joined['game_name'] != 'N/A']
temp_df = games_only.loc[games_only['val_superchats'] > 0, ["viewcount", "val_superchats"]]
temp_df = temp_df.assign(
    viewcount=temp_df['viewcount'].map(math.log),
    val_superchats=temp_df['val_superchats'].map(math.log),
)

slope_g, intercept_g, r_value_g, p_value_g, std_err_g = stats.linregress(
    temp_df["viewcount"], temp_df["val_superchats"]
)
print(f'Regression from scipy y=mx+b: \ny = {slope_g:f} x + {intercept_g:f}')
print(f'p-value: {p_value_g:e}')
print(f'r-value: {r_value_g:f}')
print(f'standard error: {std_err_g:f}')


# Games-only scatter with both power-law curves: the fit on all videos and the
# fit on game videos only.
fig, ax = plt.subplots(figsize=(15, 10))
plt.xlim([10**3, 5*10**6])
ax.plot(games_only["viewcount"], games_only["val_superchats"], 'o', alpha=0.5)
ax.set_yscale('log')
ax.set_xscale('log')

ax.set(
    xlabel="View Count",
    ylabel="Superchat Earnings (USD)",
    title="View Count vs Superchat Earnings (Games only, log-log scale)",
)
x = np.arange(10**3, 10**7, 5)
# convert the slope and intercept to the relevant values in the right places
games_scale = math.e**intercept_g
print("a = ", games_scale)
print("k = ", slope_g)
ax.plot(x, (math.e**intercept)*x**slope, color='C1', label="old regression line")
ax.plot(x, games_scale*x**slope_g, color='C2', label="games regression line")
plt.legend(loc='upper left')


plt.show()

Regression from scipy y=mx+b: 
y = 0.957162 x + -5.281012
p-value: 8.104118e-133
r-value: 0.631815
standard error: 0.034170
a =  0.005087282402439689
k =  0.9571623123318653


# Residuals of the games-only log-log fit.
print(len(temp_df))
fig, ax = plt.subplots(figsize=(15, 10))
residuals = (
    temp_df["val_superchats"] - slope_g*temp_df["viewcount"] - intercept_g
).tolist()
#print(residuals)
ax.plot(temp_df["viewcount"], residuals, 'o', alpha=0.5)
#plt.yscale('symlog')
ax.set(
    xlabel="View Count",
    ylabel="Superchat Earnings (USD)",
    title="Residuals (log-log)",
)
# horizontal reference line at y = 0
ax.axhline(color="gray")
plt.show()

1183


# Count the videos per game (ignoring 'N/A'); keys keep first-seen order.
dd = {}
for gn in joined["game_name"]:
    if gn == 'N/A':
        continue
    dd[gn] = dd.get(gn, 0) + 1


# games with at least 25 videos
popular_games = [title for title, n_videos in dd.items() if n_videos >= 25]
popular_games

['Monster Hunter Rise',
 'Minecraft',
 'Among Us',
 'Apex Legends',
 'Uma Musume Pretty Derby',
 'ARK: Survival Evolved']


# Palette: entry 0 is the background colour for all game videos; entries 1.. are
# assigned to the popular games in order.
colors_chart = ["#A6BDD7", "#FFB300", "#803E75", "#FF6800", "#C10020", "#007D34", "#F6768E", "#CEA262", "#817066"]

all_games = joined.loc[joined['game_name'] != 'N/A']

fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(all_games["viewcount"], all_games["val_superchats"], 'o', color=colors_chart[0], alpha=0.5, label="Other games")
ax.set_yscale('log')
ax.set_xscale('log')

ax.set(
    xlabel="View Count",
    ylabel="Superchat Earnings (USD)",
    title="View Count vs Superchat Earnings (GAMES ONLY / double log scale)",
)
# draw each popular game on top of the background points in its own colour
for color_idx, game in enumerate(popular_games, start=1):
    cur_game = joined.loc[joined['game_name'] == game]
    ax.plot(cur_game["viewcount"], cur_game["val_superchats"], 'o', color=colors_chart[color_idx], label=game)

# move the "Other games" entry to the end of the legend
handles, labels = ax.get_legend_handles_labels()
handles = [*handles[1:], handles[0]]
labels = [*labels[1:], labels[0]]
ax.legend(handles, labels, loc='upper left')
plt.show()


# One figure per popular game: the game's videos highlighted over all game videos,
# plus the power-law curve from that game's own log-log regression.
# `regs` collects (intercept, slope) per game, in popular_games order.
regs = []
x = np.arange(10**3, 10**7, 5)
for color_idx, game in enumerate(popular_games, start=1):
    fig, ax = plt.subplots(figsize=(15, 10))
    ax.plot(all_games["viewcount"], all_games["val_superchats"], 'o', color=colors_chart[0], alpha=0.5, label="Other games")
    ax.set_yscale('log')
    ax.set_xscale('log')

    ax.set(
        xlabel="View Count",
        ylabel="Superchat Earnings (USD)",
        title="View Count vs Superchat Earnings ({0} only / log-log scale)".format(game),
    )

    cur_game = joined.loc[joined['game_name'] == game]

    # log-transform the videos of this game that earned something
    temp_cg_df = cur_game.loc[cur_game['val_superchats'] > 0, ["viewcount", "val_superchats"]]
    temp_cg_df = temp_cg_df.assign(
        viewcount=temp_cg_df['viewcount'].map(math.log),
        val_superchats=temp_cg_df['val_superchats'].map(math.log),
    )

    slope_t, intercept_t, r_value_t, p_value_t, std_err_t = stats.linregress(
        temp_cg_df["viewcount"], temp_cg_df["val_superchats"]
    )
    print("a = ", math.e**intercept_t)
    print("k = ", slope_t)
    print(f'p-value: {p_value_t:e}')
    print(f'r-value: {r_value_t:f}')
    print(f'standard error: {std_err_t:f}')
    print('num_points: %d'%(len(temp_cg_df)))

    ax.plot(x, (math.e**intercept_t)*x**slope_t, color=colors_chart[color_idx])
    regs.append((intercept_t, slope_t))

    ax.plot(cur_game["viewcount"], cur_game["val_superchats"], 'o', color=colors_chart[color_idx], label=game)
    # move the "Other games" entry to the end of the legend
    handles, labels = ax.get_legend_handles_labels()
    handles = [*handles[1:], handles[0]]
    labels = [*labels[1:], labels[0]]
    ax.legend(handles, labels, loc='upper left')
    plt.show()

a =  0.0022374775669243094
k =  1.0140693210453922
p-value: 5.036286e-26
r-value: 0.641997
standard error: 0.083571
num_points: 212

a =  0.024647340585237255
k =  0.8541428920820173
p-value: 1.186951e-14
r-value: 0.571653
standard error: 0.099767
num_points: 153

a =  9.198275724464118e-05
k =  1.2590787495197502
p-value: 1.320476e-05
r-value: 0.765291
standard error: 0.225781
num_points: 24

a =  0.010261389553868317
k =  0.9040575053393242
p-value: 5.337854e-17
r-value: 0.660434
standard error: 0.092680
num_points: 125

a =  0.007836455982409087
k =  0.9045304900931068
p-value: 4.767082e-08
r-value: 0.698563
standard error: 0.138118
num_points: 47

a =  0.00029844862524387765
k =  1.1028099912192952
p-value: 1.710327e-03
r-value: 0.504334
standard error: 0.323824
num_points: 36


# Single figure with every popular game's points and its fitted power-law curve
# (coefficients taken from `regs`, evaluated at `x`).
colors_chart = ["#A6BDD7", "#FFB300", "#803E75", "#FF6800", "#C10020", "#007D34", "#F6768E", "#CEA262", "#817066"]

all_games = joined.loc[joined['game_name'] != 'N/A']

fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(all_games["viewcount"], all_games["val_superchats"], 'o', color=colors_chart[0], alpha=0.5, label="Other games")
ax.set_yscale('log')
ax.set_xscale('log')

ax.set(
    xlabel="View Count",
    ylabel="Superchat Earnings (USD)",
    title="View Count vs Superchat Earnings (GAMES ONLY / double log scale)",
)
for color_idx, game in enumerate(popular_games, start=1):
    game_intercept, game_slope = regs[color_idx-1][0], regs[color_idx-1][1]
    cur_game = joined.loc[joined['game_name'] == game]
    ax.plot(cur_game["viewcount"], cur_game["val_superchats"], 'o', color=colors_chart[color_idx], label=game)
    ax.plot(x, (math.e**game_intercept)*x**game_slope, color=colors_chart[color_idx])

# move the "Other games" entry to the end of the legend
handles, labels = ax.get_legend_handles_labels()
handles = [*handles[1:], handles[0]]
labels = [*labels[1:], labels[0]]



ax.legend(handles, labels, loc='upper left')
plt.show()


import sklearn.linear_model

joined.head()
# Features and target for the multivariate regression; videos without earnings
# are dropped so that the log transform is defined.
ML_COLUMNS = ["video_length","val_superchats", "viewcount", "is_gaming", "vtuber_ordinal", "affiliation_ordinal"]
to_ml = joined.loc[joined['val_superchats'] > 0, ML_COLUMNS]

# normalize the viewcount and val_superchats
to_ml = to_ml.assign(
    viewcount=to_ml['viewcount'].map(math.log),
    val_superchats=to_ml['val_superchats'].map(math.log),
)

# we don't need to scale ordinal or categorical variables in a linear regression

# the value we want to predict
y = to_ml['val_superchats']

# everything else
X = to_ml.drop('val_superchats', axis=1)
X.head()

y

0       3.444930
1       6.736678
2       6.535469
3       4.568066
4       2.706048
          ...   
2218    4.250601
2219    8.550701
2220    3.264271
2221    4.575987
2222    4.397564
Name: val_superchats, Length: 2157, dtype: float64


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out a test set. The split is stratified on vtuber_ordinal, and
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=X['vtuber_ordinal'],
    random_state=2,
)

# Fit the linear regression on the training portion only (fit returns the estimator).
linreg = sklearn.linear_model.LinearRegression().fit(X_train, y_train)

# Show the fitted coefficients, then the intercept.
for fitted_param in (linreg.coef_, linreg.intercept_):
    print(fitted_param)

# Model predictions for the training and the testing rows.
y_train_reg = linreg.predict(X_train)
y_test_reg = linreg.predict(X_test)

# score() on a LinearRegression is the r^2 reported below.
accuracy_train, accuracy_test = (
    linreg.score(X_train, y_train),
    linreg.score(X_test, y_test),
)

print("Regression r^2 on training data: %f"%accuracy_train)
print("Regression r^2 on test data: %f"%accuracy_test)

[ 0.00328273  0.31500079 -0.78465816  0.02406445  0.14428761]
1.1508926086303957
Regression r^2 on training data: 0.355851
Regression r^2 on test data: 0.351916


# Distribution of residuals (actual minus predicted) over every row of X.
y_hat = linreg.predict(X)
residuals = y - y_hat
sns.displot(residuals)
plt.title("Residuals");


# Feature matrix for the second model: drop the target and also viewcount.
X_2 = to_ml.drop(columns=['val_superchats', 'viewcount'])
X_2.head()


# Same procedure as for the first model, on the feature set without viewcount.
# The split is stratified on vtuber_ordinal with a fixed random_state.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_2,
    y,
    stratify=X_2['vtuber_ordinal'],
    random_state=2,
)

# Fit the linear regression on the training portion only (fit returns the estimator).
linreg_2 = sklearn.linear_model.LinearRegression().fit(X_train_2, y_train_2)

# Show the fitted coefficients, then the intercept.
for fitted_param_2 in (linreg_2.coef_, linreg_2.intercept_):
    print(fitted_param_2)

# Model predictions for the training and the testing rows.
y_train_reg_2 = linreg_2.predict(X_train_2)
y_test_reg_2 = linreg_2.predict(X_test_2)

# score() on a LinearRegression is the r^2 reported below.
accuracy_train_2, accuracy_test_2 = (
    linreg_2.score(X_train_2, y_train_2),
    linreg_2.score(X_test_2, y_test_2),
)

print("Regression r^2 on training data: %f"%accuracy_train_2)
print("Regression r^2 on test data: %f"%accuracy_test_2)

[ 0.00373746 -0.82975032  1.89499113  0.14987797]
4.2684802213273265
Regression r^2 on training data: 0.337681
Regression r^2 on test data: 0.335115


# Distribution of residuals (actual minus predicted) of the second model over every row of X_2.
y_hat_2 = linreg_2.predict(X_2)
residuals_2 = y - y_hat_2
sns.displot(residuals_2)
plt.title("Residuals");

Field	Description	Ended up being used
`video_name`	The title of the video
`video_id`	Youtube's id for the video	✓
`description`	The description of the video, as written by the Vtuber
`published_at`	The time at which the video was published on YouTube	✓
`video_start_time`	The time at which the livestream actually started	✓
`video_end_time`	The time at which the livestream actually ended	✓
`num_superchats`	The number of individual superchats received in a video	✓
`val_superchats`	The total value of superchats received in a video, in USD	✓
`locale`	Supposedly the language in which the video was made, but seems inaccurate
`viewcount`	The number of views the video received.¹	✓
`tags`	The tags that YouTube has assigned the video (gaming, entertainment, etc)	✓
`timestamps`	Timestamps for each individual superchat in the video	✓

	vtuber_name	affiliation	channel_id
0	shirakami_fubuki	hololive	UCdn5BQ06XqgXoAxIhbqw5Rg
1	tokino_sora	hololive	UCp6993wxpyDPHUpavwDFqgg
2	haachama	hololive	UC1CfXB_kRs3C-zaeTG3oGyg
3	natsuiro_matsuri	hololive	UCQ0UDLQCjY0rmuxCDE38FGg
4	minato_aqua	hololive	UC1opHUrw8rvnsadT-iGp7Cg

	channel_name	channel_id	video_name	video_id	description	published_at	video_start_time	video_end_time	num_superchats	val_superchats	locale	viewcount	tags	timestamps
0	shirakami_fubuki	UCdn5BQ06XqgXoAxIhbqw5Rg	【#FOXDEMON】クシャルダオラの時間だゴラァ！！！【ホロライブプロダクション/白上フブ...	14YQQoswvu0	#FOXDEMON　モンハンコラボです\n\n荒咬オウガ\nYoutube:　@Oga Ch...	2021-04-28T15:41:51Z	2021-04-28T13:00:12Z	2021-04-28T15:21:21Z	8	31.341077	ja	78525	[Action_game, Role-playing_video_game, Strateg...	[1619615216872040, 1619616211053572, 161961658...
1	shirakami_fubuki	UCdn5BQ06XqgXoAxIhbqw5Rg	【SONG】夕方の貴重な歌枠。【ホロライブ/白上フブキ】	-p8DUBvsMwc	ホロライブ所属の白上フブキです\nお久しぶりに歌っていきます✨✨\n\n歌っちゃ王\nhtt...	2021-04-28T11:20:11Z	2021-04-28T08:57:39Z	2021-04-28T11:09:31Z	117	842.756046	ja	165992	[Entertainment, Film]	[1619599977092673, 1619600425388284, 161960046...
2	shirakami_fubuki	UCdn5BQ06XqgXoAxIhbqw5Rg	【#バカタレ共】おきろ！！！！狩りの時間だ！！！【ホロライブ/白上フブキ/角巻わため/不知火...	2XH7EkqTWI8	バカタレ共です。\nモンハンアプデ！！！\n\n■不知火フレア\n@Flare Ch. 不知...	2021-04-28T03:15:29Z	2021-04-28T00:00:49Z	2021-04-28T02:52:30Z	28	689.156752	ja	185481	[Action_game, Role-playing_video_game, Strateg...	[1619562444007195, 1619568163965811, 161956822...
3	shirakami_fubuki	UCdn5BQ06XqgXoAxIhbqw5Rg	【#バカタレ共】モンスターハンタースペシャルプログラム同時視聴!!【ホロライブ/白上フブ...	v3IYT5p79Fg	バカタレ共です。\n\n「モンスターハンタースペシャルプログラム 2021.4.27」\n...	2021-04-27T14:25:41Z	2021-04-27T13:46:33Z	2021-04-27T14:20:20Z	14	96.357596	ja	114674	[Entertainment, Film]	[1619531417603994, 1619531478684038, 161953155...
4	shirakami_fubuki	UCdn5BQ06XqgXoAxIhbqw5Rg	【#from1st 】開催まであと１ヶ月!!見どころや未公開情報も解禁!!【ホロライブ１期生】	hKdM0ldOJlE	ホロライブ1期生の3周年記念ライブ「from 1st」\n2021年 5月 28日（金）\n...	2021-04-27T12:03:13Z	2021-04-27T10:58:11Z	2021-04-27T11:57:47Z	3	14.970000	ja	75277	[Entertainment, Film]	[1619521323777833, 1619521334455418, 161952346...

	vtuber_name	affiliation	channel_id	subs_count	views_count	videos_count	country_loc
0	deisu_aruran	holostars	UCKeAhJvy8zgXWbh9duVjIaQ	98400	3140315	610	JP
1	peanut_kun	independent	UCmgWMQkenFc72QnYkdxdoKA	111000	22847740	266	JP
2	kageyama_shien	holostars	UChSvpZYRPh0FvG4SJGSga3g	112000	3388583	258	JP
3	tomoshika_hikasa	voms	UC3vzVK_N_SUVKqbX69L_X4g	142000	6108838	201	JP
4	yukoku_roberu	holostars	UCANDOlYTJT7N5jlRC3zfzVA	166000	7847131	719	JP

	channel_name	channel_id	video_name	video_id	description	published_at	video_start_time	video_end_time	num_superchats	val_superchats	locale	viewcount	tags	timestamps	game_name
0	shirakami_fubuki	UCdn5BQ06XqgXoAxIhbqw5Rg	【#FOXDEMON】クシャルダオラの時間だゴラァ！！！【ホロライブプロダクション/白上フブ...	14YQQoswvu0	#FOXDEMON　モンハンコラボです\n\n荒咬オウガ\nYoutube:　@Oga Ch...	2021-04-28T15:41:51Z	2021-04-28T13:00:12Z	2021-04-28T15:21:21Z	8	31.341077	ja	78525	[Action_game, Role-playing_video_game, Strateg...	[1619615216872040, 1619616211053572, 161961658...	Monster Hunter Rise
1	shirakami_fubuki	UCdn5BQ06XqgXoAxIhbqw5Rg	【SONG】夕方の貴重な歌枠。【ホロライブ/白上フブキ】	-p8DUBvsMwc	ホロライブ所属の白上フブキです\nお久しぶりに歌っていきます✨✨\n\n歌っちゃ王\nhtt...	2021-04-28T11:20:11Z	2021-04-28T08:57:39Z	2021-04-28T11:09:31Z	117	842.756046	ja	165992	[Entertainment, Film]	[1619599977092673, 1619600425388284, 161960046...	N/A
2	shirakami_fubuki	UCdn5BQ06XqgXoAxIhbqw5Rg	【#バカタレ共】おきろ！！！！狩りの時間だ！！！【ホロライブ/白上フブキ/角巻わため/不知火...	2XH7EkqTWI8	バカタレ共です。\nモンハンアプデ！！！\n\n■不知火フレア\n@Flare Ch. 不知...	2021-04-28T03:15:29Z	2021-04-28T00:00:49Z	2021-04-28T02:52:30Z	28	689.156752	ja	185481	[Action_game, Role-playing_video_game, Strateg...	[1619562444007195, 1619568163965811, 161956822...	Monster Hunter
3	shirakami_fubuki	UCdn5BQ06XqgXoAxIhbqw5Rg	【#バカタレ共】モンスターハンタースペシャルプログラム同時視聴!!【ホロライブ/白上フブ...	v3IYT5p79Fg	バカタレ共です。\n\n「モンスターハンタースペシャルプログラム 2021.4.27」\n...	2021-04-27T14:25:41Z	2021-04-27T13:46:33Z	2021-04-27T14:20:20Z	14	96.357596	ja	114674	[Entertainment, Film]	[1619531417603994, 1619531478684038, 161953155...	N/A
4	shirakami_fubuki	UCdn5BQ06XqgXoAxIhbqw5Rg	【#from1st 】開催まであと１ヶ月!!見どころや未公開情報も解禁!!【ホロライブ１期生】	hKdM0ldOJlE	ホロライブ1期生の3周年記念ライブ「from 1st」\n2021年 5月 28日（金）\n...	2021-04-27T12:03:13Z	2021-04-27T10:58:11Z	2021-04-27T11:57:47Z	3	14.970000	ja	75277	[Entertainment, Film]	[1619521323777833, 1619521334455418, 161952346...	N/A

Vtuber Superchat Data Analysis - A Data Science Pipeline Tutorial¶

Table of Contents¶

Introduction ¶

What are Vtubers?¶

Why care?¶

Data Collection ¶

After waiting the whole day and night...¶

Data Collection, Continued - Web Scraping ¶

Data Management + Representation ¶

Exploratory Data Analysis ¶

Correlation between time of day and superchats ¶

Time of Day and Superchats: Summary ¶

Possible Future Work ¶

Relationship between view count and superchat earnings ¶

The following regression is not what we want. It has been left as a demonstration.¶

Warning! The following section describes a pitfall that you should avoid.¶

Back to regularly scheduled data science¶

Video game streams - better or worse?¶

Views vs Superchats Findings, summarized ¶

Machine Learning ¶

Takeaways ¶

Future Possibilities ¶

Vtubers list¶

	video_length	viewcount	is_gaming	vtuber_ordinal	affiliation_ordinal
0	141.150000	11.271172	1	45	6
1	131.866667	12.019695	0	45	6
2	171.683333	12.130708	1	45	6
3	33.783333	11.649849	0	45	6
4	59.600000	11.228930	0	45	6

Name	Affiliation
A.I.Channel	independent
Gawr Gura Ch. hololive-EN	hololive_en
Korone Ch. 戌神ころね	hololive
Pekora Ch. 兎田ぺこら	hololive
フブキCh。白上フブキ	hololive
Mori Calliope Ch. hololive-EN	hololive_en
Marine Ch. 宝鐘マリン	hololive
Aqua Ch. 湊あくあ	hololive
Watson Amelia Ch. hololive-EN	hololive_en
Rushia Ch. 潤羽るしあ	hololive
HAACHAMA Ch. 赤井はあと	hololive
Coco Ch. 桐生ココ	hololive
Noel Ch. 白銀ノエル	hololive
Okayu Ch. 猫又おかゆ	hololive
Matsuri Channel 夏色まつり	hololive
Takanashi Kiara Ch. hololive-EN	hololive_en
Ninomae Ina'nis Ch. hololive-EN	hololive_en
Suisei Channel	hololive
Subaru Ch. 大空スバル	hololive
Watame Ch. 角巻わため	hololive
Kanata Ch. 天音かなた	hololive
Botan Ch.獅白ぼたん	hololive
SoraCh. ときのそらチャンネル	hololive
月ノ美兎	nijisanji
Mio Channel 大神ミオ	hololive
Moona Hoshinova hololive-ID	hololive_id
本間ひまわり - Himawari Honma -	nijisanji
犬山たまき / 佃煮のりおチャンネル	independent
Towa Ch. 常闇トワ	hololive
Nene Ch.桃鈴ねね	hololive
Kureiji Ollie Ch. hololive-ID	hololive_id
鈴原るる【にじさんじ所属】	nijisanji
リゼ・ヘルエスタ -Lize Helesta-	nijisanji
アルス・アルマル -ars almal- 【にじさんじ】	nijisanji
戌亥とこ -Inui Toko-	nijisanji
Ayunda Risu Ch. hololive-ID	hololive_id
竜胆尊 / Rindou Mikoto	nijisanji
天野ピカミィ. Pikamee	voms
Pavolia Reine Ch. hololive-ID	hololive_id
Airani Iofifteen Channel hololive-ID	hololive_id
鷹宮リオン / Rion Takamiya	nijisanji
夢月ロア🌖Yuzuki Roa	nijisanji
Anya Melfissa Ch. hololive-ID	hololive_id
魔界ノりりむ	nijisanji
ぽちまる:POCHI-GOYA channel	independent
Roberu Ch. 夕刻ロベル	holostars
緋笠トモシカ - Tomoshika Hikasa -	voms
Shien Ch.影山シエン	holostars
ピーナッツくん!オシャレになりたい!	independent
Aruran Ch. アルランディス	holostars

Vtuber Superchat Data Analysis - A Data Science Pipeline Tutorial¶

Table of Contents¶

Introduction¶

What are Vtubers?¶

Why care?¶

Data Collection¶

After waiting the whole day and night...¶

Data Collection, Continued - Web Scraping¶

Data Management + Representation ¶

Exploratory Data Analysis¶

Correlation between time of day and superchats¶

Time of Day and Superchats: Summary¶

Possible Future Work¶

Relationship between view count and superchat earnings¶

The following regression is not what we want. It has been left as a demonstration.¶

Warning! The following section describes a pitfall that you should avoid.¶

Back to regularly scheduled data science¶

Video game streams - better or worse?¶

Views vs Superchats Findings, summarized¶

Machine Learning¶

Takeaways¶

Future Possibilities¶

Vtubers list¶

Introduction ¶

Data Collection ¶

Data Collection, Continued - Web Scraping ¶

Exploratory Data Analysis ¶

Correlation between time of day and superchats ¶

Time of Day and Superchats: Summary ¶

Possible Future Work ¶

Relationship between view count and superchat earnings ¶

Views vs Superchats Findings, summarized ¶

Machine Learning ¶

Takeaways ¶

Future Possibilities ¶