前言

kaggle在23天之前发布了2021年关于kagglers的调查问卷结果，本文对此进行了一些可视化与分析，是在借鉴了一些kaggle上金银牌notebook的基础上完成的。

如果想要数据以及本文的源代码文件的话，我已经放到百度云盘了，链接：https://pan.baidu.com/s/1O36QKswJT1hOSfdUQKC3dA 提取码：9ss7。(如果对你有帮助的话，点个赞呗)

当然，如果时间和条件允许的话，还是推荐去kaggle官网学习(https://www.kaggle.com/c/kaggle-survey-2021/overview)

导入需要使用的一些包:

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os
import plotly.express as px
import plotly.graph_objects as go
# 为了避免一些不必要的warning，直接将warning过滤掉
import warnings
warnings.filterwarnings('ignore')

这里用到了一个新的(对于我来说)画图工具plotly，我觉得plotly画出来的图挺好看的，后面可能会继续学习一下，学习地址(需要翻墙)：https://plotly.com/python/。

加载数据：

# Load the survey responses into a DataFrame.
# skiprows=1 drops the first line of the CSV (it is not needed here), so the
# line after it is the one pandas uses as the header row.
csv_path = 'D:/jupyter/kaggle/kaggle2021survey/kaggle_survey_2021_responses.csv'
df = pd.read_csv(csv_path, skiprows=1)
# Quick look at the first rows.
df.head()

# Shorten a few long country names so that chart labels stay readable.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the df[col] selection: with pandas
# Copy-on-Write that selection is a temporary object, so an in-place replace
# on it would never reach df.
country_col = df.columns[3]
df[country_col] = df[country_col].replace({
    'United Kingdom of Great Britain and Northern Ireland': 'UK',
    'Iran, Islamic Republic of...': 'Iran',
    'United Arab Emirates': 'UAE',
    'United States of America': 'USA',
    'Viet Nam': 'Vietnam',
})

1.参与者的背景

1.1 国家

# Print every distinct value of the country column (column 3), sorted.
countries = np.sort(df[df.columns[3]].unique())
print(f"List of countries:\n{countries}\n")

呕吼，Hong Kong和Taiwan。

import plotly.offline as py
py.offline.init_notebook_mode()


def plot_in_map(locations,counts,title):
    """Render a world choropleth in the notebook via plotly's offline ``iplot``.

    Args:
        locations: Location labels; the trace uses ``locationmode='country names'``.
        counts: Values used as the colour dimension (``z``), one per location.
        title: Figure title.
    """
    trace = {
        'type': 'choropleth',
        'locations': locations,
        'z': counts,
        'locationmode': 'country names',
        'autocolorscale': True,
        'marker': {'line': {'color': 'rgb(58,100,69)', 'width': 0.6}},
        # NOTE(review): 'autotick' is kept exactly as in the original; the
        # figure is drawn with validate=False, so unknown keys are not rejected.
        'colorbar': {'autotick': True, 'tickprefix': '', 'title': '人数'},
    }
    geo = {
        'showframe': True,
        'showcoastlines': True,
        'showocean': True,
        'showlakes': True,
        'oceancolor': '#00008B',
        'projection': {'type': 'robinson'},
        # NOTE(review): 'margin' sits inside 'geo' exactly as in the original;
        # it looks like it was meant to be a layout-level setting - confirm.
        'margin': {'b': 0, 't': 0, 'l': 0, 'r': 0, 'pad': 5},
    }
    layout = {'title': title, 'geo': geo, 'autosize': True}

    figure_spec = {'data': [trace], 'layout': layout}

    py.iplot(figure_spec, validate=False, filename='world-map')
# Number of respondents per country, drawn on the world map.
country_counts = df.iloc[:, 3].value_counts()
plot_in_map(country_counts.index, country_counts.values, '参与问卷的kagglers的国家分布情况')

从图中可以看到，印度的kagglers最多，但是我总感觉kaggle上中国人不应该这么少，至少从每个比赛的LeaderBoard来看，中国人应该是挺多的。

1.2 年龄

# Donut chart of the age-group column (column 1) over all respondents.
age_col = df.columns[1]
fig = px.pie(df, names=age_col, title='kagglers的年龄分布情况', hole=0.6)
# Put both the percentage and the category label on each slice.
fig.update_traces(textinfo='percent+label')
fig.update_layout(uniformtext={'minsize': 10, 'mode': 'hide'})
fig.show()

可以发现年龄小于30岁的占了56%。

再来看一下中国的kagglers年龄分布情况：

country = 'China'

# Same donut chart as for all respondents, restricted to the selected country.
country_rows = df.loc[df[df.columns[3]] == country]
title = f"{country} 的kagglers年龄分布情况"
fig = px.pie(country_rows, names=df.columns[1], title=title, hole=0.6)

# Only the percentage is shown by default; show the label as well.
fig.update_traces(textinfo='percent+label')
fig.update_layout(uniformtext_mode='hide', uniformtext_minsize=10)
fig.show()

# Compare the share of under-30 respondents in the selected country with the
# share over the whole survey. The overall share is computed from the data
# rather than hard-coded, so the comparison stays right if the data changes.
under_30 = df[df.columns[1]].isin(['18-21', '22-24', '25-29'])
in_country = df[df.columns[3]] == country
if not in_country.any():
    # Without this guard the percentage below would be a division by zero.
    raise ValueError(f"No respondents found for country {country!r}")

age_pct = under_30[in_country].mean() * 100
overall_pct = under_30.mean() * 100
if age_pct < overall_pct:
    print(f"{country} 的kagglers是相对比较年老的")
else:
    print(f"{country} 的kagglers是相对比较年轻的")

中国的kagglers年龄小于30岁的占了78.6%，可以看出，中国的kagglers大多数都是年轻人。

# Label every row as the selected country or 'Others'. The column is created
# here because it is needed right below; creating it again later with the
# same expression is harmless.
df['country_agg'] = np.where(df[df.columns[3]] == country, country, 'Others')

age_col = df.columns[1]
df_country = df[df.country_agg == country][age_col].value_counts(normalize=True)
df_others = df[df.country_agg == 'Others'][age_col].value_counts(normalize=True)

# Put both distributions on the same sorted set of age groups; a group missing
# on one side gets a share of 0, so x labels and y values always line up.
age_groups = df_others.index.union(df_country.index)
df_country = df_country.reindex(age_groups, fill_value=0)
df_others = df_others.reindex(age_groups, fill_value=0)

fig = go.Figure(data=[
    go.Bar(name='Others', x=df_others.index, y=df_others.values * 100),
    go.Bar(name=country, x=df_country.index, y=df_country.values * 100),
])
# Grouped bars with axis titles.
fig.update_layout(
    barmode='group',
    title='中国的kagglers年龄分布情况与其他国家对比',
    xaxis_title='Age',
    yaxis_title='Percentage of respondents',
)
fig.show()

再看一下每个国家kagglers的平均年龄与整体平均年龄差异情况：

# The raw data only has age ranges such as "18-21". To compute mean ages, each
# range is turned into a number: the midpoint of its two end points, stored in
# the new column 'age'. The open-ended "70+" range has no upper end point and
# is treated as 70 on both sides.
# Results are assigned back instead of using inplace=True on a column
# selection, which is not guaranteed to modify df (pandas Copy-on-Write).
age_parts = df.iloc[:, 1].str.split('-')
df['age1'] = age_parts.str[0].replace('70+', '70').astype('int')
df['age2'] = age_parts.str[1].fillna(70).astype('int')
df['age'] = (df.age1 + df.age2) / 2

# np.where(condition, x, y) yields x where the condition holds and y elsewhere.
# It is used here to add a column that labels each row as the selected country
# or 'Others'.
df['country_agg'] = np.where(df[df.columns[3]] == country, country, 'Others')
global_average = df.age.mean()
country_average = df[df.country_agg == country].age.mean()

# Word the title according to the sign of the gap: "younger" when the country
# mean is at or below the overall mean, "older" otherwise (the original text
# said "younger" in both cases).
age_gap = abs(global_average - country_average)
direction = 'younger' if country_average <= global_average else 'older'
title = f"With an average age of {country_average:.0f},<br>Kagglers from {country} are generally {age_gap:.0f} years {direction} than the average Kaggler"

# Mean age per country (groupby on the country column), highest first.
mean_age_by_country = df.groupby(df.columns[3]).age.mean().sort_values(ascending=False)
n_countries = len(mean_age_by_country)

# All bars share one colour, except the selected country, which is orange.
country_pos = mean_age_by_country.index.to_list().index(country)
bar_colors = ['#636EFA'] * n_countries
bar_colors[country_pos] = 'orange'

fig = go.Figure()
fig.add_trace(go.Bar(x=mean_age_by_country.index, y=mean_age_by_country, marker_color=bar_colors))

# Horizontal line at the overall mean age, given as a line shape between two
# points that span the whole x range of the bars.
average_line = {
    'type': 'line',
    'yref': 'y', 'y0': global_average, 'y1': global_average,
    'xref': 'x', 'x0': -0.5, 'x1': n_countries - 0.5,
}
fig.update_layout(
    shapes=[average_line],
    title=title,
    xaxis_title=None,
    yaxis_title='Age')

# Place the "Global Average" label near the right end of the line.
fig.add_annotation(
    x=n_countries * 0.95,
    y=global_average,
    xshift=-20,
    yshift=10,
    text="Global Average",
    showarrow=False)
fig.show()

1.3 性别

# Raw answer counts of the gender column (column 2).
df.iloc[:, 2].value_counts()

这里发现除了男和女，还有其他不同的回答，我们将除了男女的回答统一设置为Others_Gender。

# Collapse the three listed non-binary / undisclosed answers into one
# 'Others_Gender' category.
# The result is assigned back to the column: replace(..., inplace=True) on the
# df[col] selection is not guaranteed to modify df (pandas Copy-on-Write).
gender_col = df.columns[2]
df[gender_col] = df[gender_col].replace({
    'Prefer not to say': 'Others_Gender',
    'Nonbinary': 'Others_Gender',
    'Prefer to self-describe': 'Others_Gender',
})
# Donut chart of the gender split over all respondents.
fig = px.pie(df, gender_col, title="所有kagglers的性别分布", hole=0.6)
fig.update_traces(textinfo='percent+label')
fig.update_layout(uniformtext_minsize=10, uniformtext_mode='hide')
fig.show()

再看一下中国kagglers的性别组成与其他国家对比：

# Gender shares (fractions summing to 1) for the selected country and for all
# other countries, each ordered from the largest share to the smallest.
country_gender_share = (
    df.loc[df.country_agg == country]
    .iloc[:, 2]
    .value_counts(normalize=True)
    .sort_values(ascending=False)
)
others_gender_share = (
    df.loc[df.country_agg == 'Others']
    .iloc[:, 2]
    .value_counts(normalize=True)
    .sort_values(ascending=False)
)

fig = go.Figure()
fig.add_trace(go.Bar(name='Others', x=others_gender_share.index, y=others_gender_share.values * 100))
fig.add_trace(go.Bar(name=country, x=country_gender_share.index, y=country_gender_share.values * 100))
# Titles only; the bar mode is left at plotly's default.
fig.update_layout(
    title='中国Kagglers的性别组成与其他国家对比',
    xaxis_title=None,
    yaxis_title='Percentage',
)
fig.show()

从性别上来看，中国的kagglers与其他国家还是比较一致的。

1.4 学历(现在的或者未来两年将获得的)

# Donut chart of column 4 over all respondents; the legend is hidden because
# every slice already carries its own label.
education_col = df.columns[4]
fig = px.pie(df, names=education_col, title="kagglers的学历分布情况", hole=0.6)
fig.update_traces(textinfo='percent+label')
fig.update_layout(showlegend=False, uniformtext={'minsize': 10, 'mode': 'hide'})
fig.show()

再看一下中国kagglers的学历情况：

# Column 4 again, restricted to rows whose country column equals `country`.
country_rows = df.loc[df[df.columns[3]] == country]
fig = px.pie(country_rows, names=df.columns[4], title="中国kagglers的学历分布", hole=0.6)
fig.update_traces(textinfo='percent+label')
fig.update_layout(showlegend=False, uniformtext_minsize=10, uniformtext_mode='hide')
fig.show()

可以发现中国kagglers有近一半都是硕士，我已经被卷没了，生存堪忧啊。

1.5 职业

# Donut chart of column 5 over all respondents, with labelled slices and no legend.
job_col = df.columns[5]
fig = px.pie(df, names=job_col, title='kagglers的职业分布情况', hole=0.6)
fig.update_traces(textinfo='percent+label')
fig.update_layout(showlegend=False, uniformtext_mode='hide', uniformtext_minsize=10)
fig.show()

再看一下中国kagglers的职业情况：

# Column 5 again, restricted to rows whose country column equals `country`.
is_selected_country = df[df.columns[3]] == country
fig = px.pie(df.loc[is_selected_country], names=df.columns[5], title="中国的kagglers的职业分布情况", hole=0.6)
fig.update_traces(textinfo='percent+label')
fig.update_layout(uniformtext={'minsize': 10, 'mode': 'hide'}, showlegend=False)
fig.show()

学生占了近一半的比例，与其他国家相比着实有点高，看来中国很多学生都对机器学习和数据分析感兴趣呀，我已经想到我毕业时严峻的就业形势了。

1.6 编程经验

experience_col = df.columns[6]
df[experience_col].value_counts()

# Fold "I have never written code" into a "0 years" bucket. The result is
# assigned back because replace(..., inplace=True) on the df[col] selection is
# not guaranteed to modify df (pandas Copy-on-Write).
df[experience_col] = df[experience_col].replace("I have never written code", "0 years")

is_country = df[df.columns[3]] == country
df_country_ProExp = df.loc[is_country, experience_col].value_counts(normalize=True).sort_values()
df_others_ProExp = df.loc[~is_country, experience_col].value_counts(normalize=True).sort_values()

# Put both distributions on one shared category order (the Others order first,
# then any category that only the selected country has) so that each bar's
# value is plotted against its own label. The original paired each group's
# values with the other group's index.
categories = df_others_ProExp.index.union(df_country_ProExp.index, sort=False)
df_others_ProExp = df_others_ProExp.reindex(categories, fill_value=0)
df_country_ProExp = df_country_ProExp.reindex(categories, fill_value=0)

fig = go.Figure(data=[
    go.Bar(name="others", x=df_others_ProExp.values * 100, y=df_others_ProExp.index, orientation='h'),
    go.Bar(name="China", x=df_country_ProExp.values * 100, y=df_country_ProExp.index, orientation='h'),
])
fig.update_layout(
    title='中国的kagglers的编程经验与其他国家对比',
    xaxis_title='Percentage',
    yaxis_title='Programming Experience',
)
fig.show()

可以发现中国kagglers的编程经验大多为1-3年，这与中国kagglers的职业大多是学生也是相对应的，学生的编程经验应该大多为1-3年。

1.7 所在公司的规模

# Answer counts of column 116, largest first; computed once and reused for
# both the table view and the bar chart.
company_size_counts = df[df.columns[116]].value_counts().sort_values(ascending=False)
company_size_counts

fig = go.Figure()
fig.add_trace(go.Bar(x=company_size_counts.index, y=company_size_counts.values))
fig.update_layout(title='kagglers所在的公司规模情况', xaxis_title='公司规模', yaxis_title='人数')
fig.show()

可以发现，在0-49人的小公司工作的kagglers最多，是因为小公司比较闲吗？我还没开始工作，不太理解。

1.8 年薪(美元)

# Answer counts of column 127, largest first, shown as a bar chart.
salary_counts = df[df.columns[127]].value_counts().sort_values(ascending=False)
salary_bar = go.Bar(x=salary_counts.index, y=salary_counts.values)
fig = go.Figure(data=[salary_bar])
fig.update_layout(title='kagglers的年薪情况', xaxis_title='年薪', yaxis_title='人数')
fig.show()

年薪$0-999（人民币0-6391.602元）的kagglers最多，看到这我又仔细看了一下数据，是年薪，没错，后来一想，因为很多kagglers是学生，这年薪确实正常。

2.各种工具以及算法偏好情况

2.1 编程语言

# Merge the answer counts of columns 7..19 into one label -> count mapping
# (presumably one column per option of a multi-select question - verify
# against the CSV header). A label that shows up again in a later column
# overwrites the earlier count.
language_counts = {}
for column in df.columns[7:20]:
    language_counts.update(dict(df[column].value_counts()))
series_ProLan = pd.Series(language_counts).sort_values(ascending=False)

fig = go.Figure(go.Bar(x=series_ProLan.index, y=series_ProLan.values))
fig.update_layout(title='kagglers使用的编程语言偏好情况', xaxis_title='编程语言', yaxis_title='人数')
fig.show()

果然，做数据分析和机器学习的还是用python的最多，当然数据分析离不开数据库，所以SQL排在了第二位。

2.2 集成开发环境(IDE)

# One label -> count mapping built from columns 21..33, walked in column
# order; later columns win if a label repeats.
ide_counts = {
    label: count
    for column in df.columns[21:34]
    for label, count in dict(df[column].value_counts()).items()
}
series_IDE = pd.Series(ide_counts).sort_values(ascending=False)
ide_bar = go.Bar(x=series_IDE.index, y=series_IDE.values)
fig = go.Figure(ide_bar)
fig.update_layout(title='kagglers使用的IDE偏好情况', yaxis_title='人数')
fig.show()

IDE偏好我感觉不仅仅是kagglers，只要是做机器学习相关的，用的IDE大多都是Jupyter Notebook、VSCode和PyCharm吧，至少我身边的人是这样。

2.3 notebook

# Tally the answers found in columns 34..50 into a single mapping, keeping the
# column order; a repeated label takes the count from the later column.
notebook_counts = {}
for position in range(34, 51):
    per_column = dict(df[df.columns[position]].value_counts())
    notebook_counts = {**notebook_counts, **per_column}
series_notebook = pd.Series(notebook_counts).sort_values(ascending=False)
fig = go.Figure(go.Bar(x=series_notebook.index, y=series_notebook.values))
fig.update_layout(
    yaxis_title='人数',
    title='kagglers使用的notebook偏好情况',
)
fig.show()

作为一个新手kaggler，目前这些notebook我都没用过……

2.4 计算平台

# Combine the answer counts of columns 52..57 into one mapping and plot them
# from most to least frequent.
hardware_counts = {}
for column in df.columns[52:58]:
    for label, count in dict(df[column].value_counts()).items():
        hardware_counts[label] = count
series_Compute = pd.Series(hardware_counts).sort_values(ascending=False)
fig = go.Figure()
fig.add_trace(go.Bar(x=series_Compute.index, y=series_Compute.values))
fig.update_layout(title='kagglers使用的计算平台情况', yaxis_title='人数')
fig.show()

可以发现，大部分kagglers是没有使用GPU、TPU计算资源的。

2.5 可视化工具

# Answer counts of columns 59..70 merged into one mapping (later columns win
# on a repeated label), shown as a pie chart.
visual_counts = {}
for column in df.columns[59:71]:
    visual_counts.update(dict(df[column].value_counts()))
series_visual = pd.Series(visual_counts).sort_values(ascending=False)

pie_trace = go.Pie(labels=list(series_visual.index), values=list(series_visual.values))
fig = go.Figure(data=[pie_trace])
fig.update_layout(title='可视化工具的使用偏好情况')
fig.show()

还是Matplotlib和Seaborn使用的最多，Plotly排在第三位。

2.6 机器学习框架

# Build one label -> count mapping from columns 72..89, in column order, then
# plot it sorted from the most to the least frequent label.
framework_counts = {
    label: count
    for column in df.columns[72:90]
    for label, count in dict(df[column].value_counts()).items()
}
series_MLFrame = pd.Series(framework_counts).sort_values(ascending=False)
fig = go.Figure()
fig.add_trace(go.Bar(x=series_MLFrame.index, y=series_MLFrame.values))
fig.update_layout(yaxis_title='人数', title='机器学习框架的使用偏好情况')
fig.show()

2.7 机器学习算法

# Merge the answer counts of columns 90..101 into a single mapping; when a
# label repeats, the count from the later column is kept.
algorithm_counts = {}
for position in range(90, 102):
    algorithm_counts.update(dict(df[df.columns[position]].value_counts()))
series_algorithms = pd.Series(algorithm_counts).sort_values(ascending=False)
algorithm_bar = go.Bar(x=series_algorithms.index, y=series_algorithms.values)
fig = go.Figure(algorithm_bar)
fig.update_layout(title='机器学习算法的使用情况', yaxis_title='人数')
fig.show()

最常用的机器学习算法是线性回归和逻辑回归，其次是决策树和随机森林，再其次就是我目前正在学习的梯度提升方法，后面会继续更新XGBoost、LightGBM等算法的学习笔记。

2.8 计算机视觉方法

# Merge the answer counts of columns 102..108 into one label -> count mapping
# and sort it from the most to the least frequent label.
cv_counts = {}
for column in df.columns[102:109]:
    cv_counts.update(dict(df[column].value_counts()))
series_temp = pd.Series(cv_counts).sort_values(ascending=False)
series_temp.index

# Shortened labels for the x axis, applied by position to the sorted counts.
# NOTE(review): this list assumes the sorted order printed by
# `series_temp.index` above - re-check it whenever the data changes.
short_labels = [
    "VGG, Inception, ResNet, ResNeXt, NASNet, EfficientNet, etc",
    "U-Net, Mask R-CNN, etc",
    "YOLOv3, RetinaNet, etc",
    "PIL, cv2, skimage, etc",
    "GAN, VAE, etc",
    "None",
    "Other",
]
series_cv = pd.Series(series_temp.values, index=short_labels)
# This statement sits at column 0; the original line was indented, which is an
# IndentationError at module level.
fig = go.Figure(go.Bar(x=series_cv.index, y=series_cv.values))
fig.update_layout(
    title='计算机视觉方法的使用情况',
    yaxis_title='人数',
)
fig.show()

数据和本文源代码获取地址：https://pan.baidu.com/s/1O36QKswJT1hOSfdUQKC3dA 提取码：9ss7。(如果对你有帮助的话，点个赞呗)