Data Science — Visualization資料科學程式馬拉松

D19 Python 資料視覺化工具與常見統計圖表介紹

Python 資料視覺化主要套件

Matplotlib 操作方式

import matplotlib.pyplot as plt

Matplotlib:常用參數說明

matplotlib.pyplot.plot(*args,scalex=True,scaley=True,data=None,**kwargs)
import numpy as np
import matplotlib.pyplot as plt
x = np.arange(0, 5, 0.1)
y = np.sin(x)
plt.plot(x, y,)
import numpy as np
import matplotlib.pyplot as plt

x = np.arange(10)
# Draw four series with a single plot() call; every (x, y) pair is followed
# by its own format string.
plt.plot(x, x * 1.5, 'go:', x, x * 2.5, 'rx', x, x * 3.5, '^', x, x * 4.5, 'bd-.')

Matplotlib:figure 參數說明

matplotlib.pyplot.figure(num = None, figsize=None, dpi=None,facecolor=None,edgecolor=None,
frameon=True,FigureClass=<class 'matplotlib.figure.Figure'>,clear=False,
**kwargs)
import numpy as np
import matplotlib.pyplot as plt

x = np.arange(0, 5, 0.1)
y = np.sin(x)
# Create the figure explicitly so its size and background color can be set.
plt.figure(figsize=(8, 6),
           facecolor='c')
plt.plot(x, y)
plt.xlabel("X")
plt.ylabel("Y")
plt.title("Plot with figsize (8,6)")
plt.show()

Matplotlib:subplot 參數說明

matplotlib.pyplot.subplot(nrows, ncols, index, **kwargs)
import numpy as np
import pandas as pd  # required for pd.Series below
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 2)  # create a 2x2 grid of axes
data = pd.Series(np.random.rand(5), index=list('12345'))
# ax selects the target axes, color sets the bar color, alpha sets the opacity.
data.plot.bar(ax=axes[0, 1], color='b', alpha=1)
data.plot.barh(ax=axes[1, 1], color='b', alpha=0.5)
data.plot.bar(ax=axes[1, 0], color='c', alpha=0.8)
data.plot.barh(ax=axes[0, 0], color='r', alpha=0.5)

plt.show()

範例說明:散點圖(Scatter Plots)

matplotlib.pyplot.scatter(x, y, s=None, c=None, marker=None, cmap=None, norm=None, vmin=None,
vmax=None, alpha=None, linewidths=None, verts=<deprecated parameter>,
edgecolors=None, *, plotnonfinite=False, data=None, **kwargs)
X = np.random.normal(0, 1, 100)
Y = np.random.normal(0, 1, 100)
plt.scatter(X, Y, color='b', alpha=0.5,s=100, edgecolors='red')
plt.title("Scatter plot ")

範例說明:長條圖(Bar Plots)

x = np.arange(0., 10., 0.7)
y = np.arange(0., 10., 0.7)
plt.bar(x, y)

長條圖 in pandas plot

重點整理

D20 使用 Matplotlib 繪製各種常用圖表

# Load the required packages.
import matplotlib.pyplot as plt
import numpy as np

# Prepare the data: a sine wave sampled at every degree from 0 to 179.
x = np.arange(0, 180)
y = np.sin(x * np.pi / 180.0)

# Start plotting: hand the x and y data to plot().
plt.plot(x, y)

# Set the axis ranges; if omitted, the library picks them automatically.
plt.xlim(-30, 390)
plt.ylim(-1.5, 1.5)

# Add the x-axis label, y-axis label and title as needed.
plt.xlabel("x-axis")
plt.ylabel("y-axis")
plt.title("The Title")

# Everything above only builds the figure; this call actually displays it.
plt.show()
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10, 6))  # set the figure size
# Create three stacked axes (3 rows x 1 column); each one can be edited and
# drawn on individually through its own handle.
ax1 = fig.add_subplot(3, 1, 1)
ax2 = fig.add_subplot(3, 1, 2)
ax3 = fig.add_subplot(3, 1, 3)
plt.show()
import numpy as np
import matplotlib.pyplot as plt
#設定 figure 的尺寸
fig = plt.figure(figsize=(10,6))
#設定 x , y 數值
x = np.arange(0, 3 * np.pi, 0.1)
y_sin = np.sin(x)
y_cos = np.cos(x)
#設定雙格畫板以及選特定畫板畫圖
plt.subplot(2, 1, 1)
plt.plot(x, y_sin)
#命名圖象
plt.title("Sine")
plt.subplot(2, 1, 2)
plt.plot(x, y_cos)
#命名圖象
plt.title("Cosine")
plt.show()

除了 PLT 之外的繪圖:figure 參數說明

matplotlib.pyplot.figure(
num = None,
figsize=None,
dpi=None,
facecolor=None,
edgecolor=None,
frameon=True,
FigureClass=<class 'matplotlib.figure.Figure'>,
clear=False,
**kwargs)

除了 PLT 之外的繪圖:plt.text 參數說明

製作繪圖板 Axes

import matplotlib.pyplot as plt

# Outer frame.
plt.axes([0.1, 0.1, .8, .8])
# Hide the tick marks on both axes.
plt.xticks([])
plt.yticks([])
plt.text(0.6, 0.6, 'axes([0.1,0.1,.8,.8])', ha='center', va='center', size=20, alpha=.5)

# Inner frame.
plt.axes([0.2, 0.2, .3, .3])
plt.xticks([])
plt.yticks([])
plt.text(0.5, 0.5, 'axes([0.2,0.2,.3,.3])', ha='center', va='center', size=16, alpha=.5)
plt.show()

製作 3D 繪圖板

# Import the required modules.
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (registers the '3d' projection on old Matplotlib)

# Create a 3D coordinate system.
fig = plt.figure()
# Axes3D(fig) no longer attaches itself to the figure on current Matplotlib,
# which leaves an empty canvas; add_subplot(projection='3d') works on old and
# new versions alike.
ax = fig.add_subplot(projection='3d')

# Parameters and options can be looked up directly:
# help(plt.plot)
# help(np.random.sample)

# Build a cosine-shaped curve from x and y.
x = np.linspace(0, 1, 100)  # linspace creates an evenly spaced array
y = np.cos(x * 2 * np.pi) / 2 + 0.5
# zdir='z' places the data along the z axis; zs=0.5 draws it at z = 0.5.
ax.plot(x, y, zs=0.5, zdir='z', color='black', label='curve in (x, y)')

D21 使用 Seaborn 進行資料視覺化

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data setup.
data = np.array([3, 12, 5, 18, 45])
bars = np.array(['A', 'B', 'C', 'D', 'E'])
y_pos = np.arange(len(bars))
plt.yticks([3, 5, 12, 18, 45])  # tick values on the y axis
# Pass x and y by keyword: seaborn 0.12+ rejects them as positional arguments,
# and the keyword form also works on older releases.
sns.barplot(x=y_pos, y=data, color=(0.2, 0.8, 0.6, 0.6))  # color is RGB plus alpha
plt.legend(['b'])  # add a legend
# Custom axis titles are configured through Matplotlib's xlabel / ylabel.
plt.xlabel('title of the xlabel', fontweight='bold', color='orange', fontsize='17', horizontalalignment='center')
plt.ylabel('title of the ylabel ', fontweight='bold', color='r', fontsize='10', horizontalalignment='center')
sns.set_style('darkgrid') # 設定圖形樣式
def sinplot(flip=1):
    """Plot six phase-shifted sine curves on the current axes.

    Curve ``i`` (1 through 6) is ``sin(x + 0.5 * i) * (7 - i) * flip``, so the
    amplitude shrinks from 6 down to 1.

    Args:
        flip: Multiplier applied to every curve; a negative value mirrors the
            curves vertically.
    """
    x = np.linspace(0, 14, 100)
    for i in range(1, 7):
        plt.plot(x, np.sin(x + i * .5) * (7 - i) * flip)
# sns.set()
sinplot()
def sinplot(flip=1):
    """Plot six phase-shifted sine curves on the current axes.

    Curve ``i`` (1 through 6) is ``sin(x + 0.5 * i) * (7 - i) * flip``, so the
    amplitude shrinks from 6 down to 1.

    Args:
        flip: Multiplier applied to every curve; a negative value mirrors the
            curves vertically.
    """
    x = np.linspace(0, 14, 100)
    for i in range(1, 7):
        plt.plot(x, np.sin(x + i * .5) * (7 - i) * flip)
sns.set_style("whitegrid")#加入sns.set_style(),並於括弧內加入欲選擇的主題,如:sns.set_style(whitegrid),
#則可得範例二的圖像。範例三、四、五則以此類推。
sinplot()

seaborn 視覺化線性關係

sns.set(style="whitegrid")
tips = sns.load_dataset("tips")
sns.lmplot(x="total_bill", y="tip", data= tips)
sns.set(style="whitegrid")
tips = sns.load_dataset("tips")
sns.lmplot(x="total_bill", y="tip", col='time', hue='smoker', data= tips)

seaborn 使用語義映射繪製數據子集

sns.set(style="whitegrid")
fmri = sns.load_dataset("fmri")
sns.relplot(x="timepoint", y="signal", kind='line', hue="event" , data=fmri)
sns.set(style="whitegrid")
fmri = sns.load_dataset("fmri")
sns.relplot(x="timepoint", y="signal", kind='line', hue="region" , style="event", data=fmri)
sns.set(style="whitegrid")
fmri = sns.load_dataset("fmri")
sns.relplot(x="timepoint", y="signal", hue="event", style="event",
kind="line", data=fmri)

seaborn 聚合和表示不確定性

sns.set(style="dark")
# The dataset loaded here is "fmri", so bind it to a matching name instead of
# `tips` (which a later snippet uses for the unrelated "tips" dataset).
fmri = sns.load_dataset("fmri")  # load the dataset
fmri.sample(50)  # inspect fifty random rows
sns.set(style="whitegrid")
fmri = sns.load_dataset("fmri")
sns.relplot(x="timepoint", y="signal", kind='line',data=fmri)
sns.set(style="whitegrid")
fmri = sns.load_dataset("fmri")
sns.relplot(x="timepoint", y="signal", kind='line', ci="sd",
data=fmri)
sns.set(style="whitegrid")
fmri = sns.load_dataset("fmri")
sns.relplot(x="timepoint", y="signal", kind='line', ci= None,
data=fmri)

seaborn 視覺化統計關係

#定義主題風格
sns.set(style="darkgrid")
#載入tips
tips = sns.load_dataset("tips")
#繪製圖形,根據不同類型的三點設定圖
sns.relplot(x="total_bill", y="tip", hue="smoker", style="time", data=tips)
plt.show()
#可繪製有漸變效果的散點圖
sns.relplot(x="total_bill", y="tip", hue="size", palette="ch:r=-.5,l=.75", data=tips);
plt.show()
sns.set(color_codes=True)
mean, cov = [0, 1], [(1, .5), (.5, 1)]  # mean vector and covariance matrix
x, y = np.random.multivariate_normal(mean, cov, 1000).T
# The "ticks" style is applied only to figures created inside this block.
with sns.axes_style("ticks"):
    sns.jointplot(x=x, y=y, kind="hex", color="r")
plt.show()
flights = sns.load_dataset("flights")  # load the flights dataset
# Reshape to a month x year table of passenger counts. The arguments are
# passed by keyword because pandas 2.0+ no longer accepts them positionally.
flights = flights.pivot(index="month", columns="year", values="passengers")
# Draw a separator line between cells, use a different colormap, and hide the color bar.
sns.heatmap(flights, linewidths=1, cmap="YlGnBu", cbar=False)
plt.show()

seaborn 視覺化數據集的分佈

import numpy as np 
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
np.random.seed(sum(map(ord, "distributions")))
x = np.random.normal(size = 100) 
sns.distplot(x)
x = np.random.normal(size=100)
sns.distplot(x, kde=False, rug=True)
x = np.random.normal(size=100)
sns.distplot(x, bins=20 , kde=False, rug=True)
x = np.random.normal(size=100)
sns.distplot(x, hist=False, rug=True)
# Build a kernel density estimate by hand: one Gaussian kernel per
# observation, summed and normalized.
x = np.random.normal(0, 1, size=30)
bandwidth = 1.06 * x.std() * x.size ** (-1 / 5.)
support = np.linspace(-4, 4, 200)
kernels = []
for x_i in x:
    # Gaussian centered on the observation, evaluated over the support grid.
    kernel = stats.norm(x_i, bandwidth).pdf(support)
    kernels.append(kernel)
    plt.plot(support, kernel, color="r")
sns.rugplot(x, color=".2", linewidth=3)

# `trapz` was removed from scipy.integrate in SciPy 1.14; `trapezoid` is the
# replacement. Fall back to the old name only on SciPy releases that predate it.
try:
    from scipy.integrate import trapezoid
except ImportError:
    from scipy.integrate import trapz as trapezoid

density = np.sum(kernels, axis=0)
# Normalize so the summed curve integrates to 1 over `support`.
density /= trapezoid(density, support)
plt.plot(support, density)
sns.kdeplot(x, shade=True)
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns

mean, cov = [0, 1], [(1, .5), (.5, 1)]
data = np.random.multivariate_normal(mean, cov, 200)
df = pd.DataFrame(data, columns=["x", "y"])
# Default joint plot: scatter in the center, histograms on the margins.
sns.jointplot(x="x", y="y", data=df)

x, y = np.random.multivariate_normal(mean, cov, 1000).T
# The "white" style is applied only to the figure created inside this block.
with sns.axes_style("white"):
    sns.jointplot(x=x, y=y, kind="hex", color="k")

D22 運用實際資料集進行資料視覺化練習

# 導入必要的程式庫
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
# 取得鳶尾花資料集
df = sns.load_dataset('iris')
df.info()
sns.boxplot(data = df, orient = "h")
sns.stripplot(x = "species", y = "petal_length", data = df)
sns.swarmplot(x = "species", y = "petal_length", data = df)
sns.boxplot(x = "species", y = "petal_length", data = df)
sns.swarmplot(x = "species", y = "petal_length", data = df)

核密度估計(Kernel Density Estimates, KDE)

sns.set_style("ticks")
sns.pairplot(df,hue = 'species',diag_kind = "kde",kind = "scatter",palette = "husl")
g = sns.pairplot(df,hue = 'species',diag_kind = "kde",kind = "scatter",palette = "husl")
g.map_upper(plt.scatter)
g.map_lower(sns.kdeplot, cmap = "Blues_d")
g.map_diag(sns.kdeplot, lw = 3, legend = False)

D23 結合 Pandas 與 Matplotlib 進行進階資料視覺化練習

# Count the duplicated rows in each table, then drop them in place.
df_white.duplicated().sum(), df_red.duplicated().sum()
df_white.drop_duplicates(inplace=True)
df_red.drop_duplicates(inplace=True)
# Total number of missing values in each table. The names now match the
# df_red / df_white used above.
# NOTE(review): the frames are defined outside this snippet - confirm df_* is the real naming.
df_red.isnull().sum().sum(), df_white.isnull().sum().sum()

什麼是視覺化?

直方圖

# One histogram per numeric column of df_wine.
df_wine.hist(bins=10, color='lightblue', edgecolor='blue', xlabelsize=8, ylabelsize=8, grid=False)
# NOTE(review): rect extends beyond the usual 0-1 figure fractions; presumably
# a deliberate trick to enlarge the rendered grid - confirm.
plt.tight_layout(rect=(1, 1, 3, 3))

熱力圖:熱圖中的梯度根據屬性之間的相關性強度而變化。

f, ax = plt.subplots(figsize=(10, 6))
b = sns.heatmap(df_all.corr(), annot=True, linewidths=.05, ax=ax)
f.subplots_adjust(top=0.93)
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
title= f.suptitle('Correlation Heatmap for wine attributes', fontsize=12)

聯合圖

sns.jointplot(data=df_wine, x="fixed acidity", y="volatile acidity", kind = 'reg')
sns.jointplot(data=df_wine, x="alcohol", y="chlorides", kind = 'reg')
sns.jointplot(data=df_wine, x="volatile acidity", y="citric acid", kind = 'reg')

小提琴圖

PYTHON 數據可視化

數據降維與可視化

D24 BOKEH 輕鬆以網頁呈現視覺化圖表

Bokeh

Bokeh 程式的基本運作

from bokeh.resources import INLINE
import bokeh.io
bokeh.io.output_notebook(INLINE)
from bokeh.plotting import figure, show
from bokeh.models import widgets
from bokeh.io import output_notebook, output_file
output_notebook()
# Plain ASCII quotes: the typographic quotes in the original are a syntax error.
output_file("out.html")
p = figure()
p.line([1, 2, 3, 4, 5], [5, 4, 3, 2, 1])
show(p)
p = figure(width=800,height=300)
p.circle([1,2,3],[2,5,3], size=[10,20,30], color=["pink","olive","gold"])
show(p)

網頁元件與互動圖表

from IPython.display import IFrame
IFrame('https://demo.bokeh.org/sliders', width=900, height=500)

製作互動圖表的能力

製作互動圖表 — CustomJS 回調

from bokeh.layouts import column
from bokeh.models import CustomJS, Slider
from bokeh.plotting import show

# NOTE(review): `source` and `plot` are not defined in this snippet. The
# callback reads source.data['x'] / ['y'], so `source` is presumably a
# ColumnDataSource with those columns and `plot` a figure drawing from it -
# confirm they are created in an earlier cell.

# Define the interaction; `code` holds the JavaScript that runs in the browser.
callback = CustomJS(args=dict(source=source), code="""
var data = source.data;
var f = cb_obj.value
var x = data['x']
var y = data['y']
for (var i = 0; i < x.length; i++) {
y[i] = Math.pow(x[i], f)
}
source.change.emit();
""")
# Create the widget and give it a title.
slider = Slider(start=0.1, end=4, value=1, step=.1, title="power")
slider.js_on_change('value', callback)
# Build the page layout.
layout = column(slider, plot)
# Render the result.
show(layout)

CustomJS 選擇

# CustomJS callback for selections.
from random import random

from bokeh.layouts import row
from bokeh.models import ColumnDataSource, CustomJS
from bokeh.plotting import figure, output_file, show

# Save the result to an HTML file.
output_file("callback.html")

x = [random() for _ in range(500)]
y = [random() for _ in range(500)]

# Left plot: the points that can be lasso-selected.
# width/height replace plot_width/plot_height, which Bokeh 3 removed.
s1 = ColumnDataSource(data=dict(x=x, y=y))
p1 = figure(width=400, height=400, tools="lasso_select", title="Select Here")
p1.circle('x', 'y', source=s1, alpha=0.6)

# Right plot: starts empty and is filled by the callback below.
s2 = ColumnDataSource(data=dict(x=[], y=[]))
p2 = figure(width=400, height=400, x_range=(0, 1), y_range=(0, 1),
            tools="", title="Watch Here")
p2.circle('x', 'y', source=s2, alpha=0.6)

# When the selection in s1 changes, copy the selected points into s2.
s1.selected.js_on_change('indices', CustomJS(args=dict(s1=s1, s2=s2), code="""
var inds = cb_obj.indices;
var d1 = s1.data;
var d2 = s2.data;
d2['x'] = []
d2['y'] = []
for (var i = 0; i < inds.length; i++) {
d2['x'].push(d1['x'][inds[i]])
d2['y'].push(d1['y'][inds[i]])
}
s2.change.emit();
""")
)

layout = row(p1, p2)
show(layout)

邊緣和節點渲染器 GraphRenderer

import networkx as nx

# Load the related Bokeh packages.
from bokeh.io import output_file, show
from bokeh.models import (BoxSelectTool, Circle, EdgesAndLinkedNodes, HoverTool,
                          MultiLine, NodesAndLinkedEdges, Plot, Range1d, TapTool,)
from bokeh.palettes import Spectral4
from bokeh.plotting import from_networkx

# Build the graph for the interactive network plot.
G = nx.karate_club_graph()

# width/height replace plot_width/plot_height, which Bokeh 3 removed.
plot = Plot(width=400, height=400,
            x_range=Range1d(-1.1, 1.1), y_range=Range1d(-1.1, 1.1))
plot.title.text = "Graph Interaction Demonstration"
plot.add_tools(HoverTool(tooltips=None), TapTool(), BoxSelectTool())

graph_renderer = from_networkx(G, nx.circular_layout, scale=1, center=(0, 0))

# Node glyph. The original line only read the attribute, which does nothing.
# NOTE(review): the glyph settings here are taken from the Bokeh
# graph-interaction example - confirm or adjust.
graph_renderer.node_renderer.glyph = Circle(size=15, fill_color=Spectral4[0])
# Edge glyph (same caveat as above).
graph_renderer.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width=5)

# Selection / inspection policies.
graph_renderer.selection_policy = NodesAndLinkedEdges()
graph_renderer.inspection_policy = EdgesAndLinkedNodes()

# Add the graph to the plot.
plot.renderers.append(graph_renderer)

# Output: write the HTML file and display the plot.
output_file("interactive_graphs.html")
show(plot)

D25 Basemap 進行地理資訊繪圖(略)

D26 使用 PANDAS 與 BASEMAP 將數據整合於地理資訊圖表(略)

理科與藝術交織成靈魂的會計人,喜愛戲劇與攝影,但也喜歡資料科學。

理科與藝術交織成靈魂的會計人,喜愛戲劇與攝影,但也喜歡資料科學。