如何使用 Pandas 绘制基于时间序列的图表?
在一段时间内收集并按时间索引的一系列数据点称为时间序列数据。这些观测值记录在连续、等间隔的时间点上。例如,ECG 信号、EEG 信号、股票市场、天气数据等,都是在一段时间内按时间索引并记录的。分析这些数据并预测未来的观测值有着广泛的研究空间。
在本文中,我们将看到如何在 Python 中使用 Pandas 库实现 EDA——探索性数据分析。我们将尝试通过使用 matplotlib.pyplot、seaborn、statsmodels 等包绘制各种图表,来推断特定时间段内数据的性质。
为了便于理解绘图和其他函数,我们将创建一个 16 行的示例数据集,其中包括日期以及 A、B、C、D、E 五个数值列;随后将日期列设为索引,得到一个 16 行 5 列的数据帧。
Python3
import pandas as pd

# Sample data used to build the dataframe: 16 monthly dates
# (2020-01-25 .. 2021-04-25) and five numeric series, A to E,
# holding one value per date.
sample_timeseries_data = {
    'Date': ['2020-01-25', '2020-02-25',
             '2020-03-25', '2020-04-25',
             '2020-05-25', '2020-06-25',
             '2020-07-25', '2020-08-25',
             '2020-09-25', '2020-10-25',
             '2020-11-25', '2020-12-25',
             '2021-01-25', '2021-02-25',
             '2021-03-25', '2021-04-25'],
    'A': [102, 114, 703, 547,
          641, 669, 897, 994,
          1002, 974, 899, 954,
          1105, 1189, 1100, 934],
    'B': [1029, 1178, 723, 558,
          649, 669, 899, 1000,
          1012, 984, 918, 959,
          1125, 1199, 1109, 954],
    'C': [634, 422, 152, 23,
          294, 1452, 891, 990,
          924, 960, 874, 548,
          174, 49, 655, 914],
    'D': [1296, 7074, 3853, 4151,
          2061, 1478, 2061, 3853,
          6379, 2751, 1064, 6263,
          2210, 6566, 3918, 1121],
    'E': [10, 17, 98, 96,
          85, 89, 90, 92,
          86, 84, 78, 73,
          71, 65, 70, 60],
}

# Build the dataframe with Date, A, B, C, D and E as columns.
dataframe = pd.DataFrame(
    sample_timeseries_data,
    columns=['Date', 'A', 'B', 'C', 'D', 'E'])

# Convert the Date column from object (strings) to datetime values.
# pd.to_datetime is used rather than .astype("datetime64") because
# pandas 2.x raises TypeError on casts to the unit-less "datetime64" dtype.
dataframe["Date"] = pd.to_datetime(dataframe["Date"])

# Use Date as the index, leaving A-E as the five data columns.
dataframe = dataframe.set_index("Date")

# Bare expression: displays the dataframe when run as a notebook cell.
dataframe
Python3
import matplotlib.pyplot as plt
# Line plot of the "A" column. `dataframe` is expected to have been
# created by an earlier snippet.
# Use a built-in style sheet to change the look and feel of the plot
plt.style.use("fivethirtyeight")
# Create a new figure with figsize (12, 10)
plt.figure(figsize=(12, 10))
# Label the axes and set a title
plt.xlabel("Date")
plt.ylabel("Values")
plt.title("Sample Time Series Plot")
# Plot the "A" column alone
plt.plot(dataframe["A"])
Python3
# Use the built-in "fivethirtyeight" style sheet
plt.style.use("fivethirtyeight")
# Plot the dataframe's columns with subplots=True (one subplot per
# column) on a figure of size (12, 15)
dataframe.plot(subplots=True, figsize=(12, 15))
Python3
import matplotlib.pyplot as plt
# Bar plot of the "A" column. `dataframe` is expected to have been
# created by an earlier snippet.
# Use a built-in style sheet to change the look and feel of the plot
plt.style.use("fivethirtyeight")
# Create a new figure with figsize (15, 10)
plt.figure(figsize=(15, 10))
# Label the axes and set a title
plt.xlabel("Date")
plt.ylabel("Values")
plt.title("Bar Plot of 'A'")
# Draw one bar per index entry, with the "A" value as its height
# and a bar width of 5
plt.bar(dataframe.index, dataframe["A"], width=5)
Python3
# Rolling mean of every column over a window of 5 rows; rows whose
# window holds fewer than 5 observations come out as NaN.
dataframe.rolling(window = 5).mean()
Python3
import matplotlib.pyplot as plt
# Plot the "A" column together with its rolling mean. `dataframe` is
# expected to have been created by an earlier snippet.
# Use a built-in style sheet to change the look and feel of the plot
plt.style.use("fivethirtyeight")
# Create a new figure with figsize (12, 10)
plt.figure(figsize=(12, 10))
# Label the axes and set a title
plt.xlabel("Date")
plt.ylabel("Values")
plt.title("Values of 'A' and Rolling Mean (2) Plot")
# Plot the original "A" column, then the "A" column of the rolling-mean
# dataframe (window = 2). min_periods=1 lets the first row produce a
# value from a single observation instead of NaN.
plt.plot(dataframe["A"])
plt.plot(dataframe.rolling(
window=2, min_periods=1).mean()["A"])
Python3
import statsmodels.api as sm
from pylab import rcParams
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Expose the datetime index as a "Date" column and split it into
# Year and Month columns (later snippets read these columns).
dataframe['Date'] = dataframe.index
dataframe['Year'] = dataframe['Date'].dt.year
dataframe['Month'] = dataframe['Date'].dt.month

# Use a built-in style sheet
plt.style.use("fivethirtyeight")

# Set the runtime configuration parameters BEFORE plotting: figure size
# and label sizes are read when a figure is created, so setting them
# after decomposition.plot() would leave that figure unchanged.
rcParams['figure.figsize'] = 12, 10
rcParams['axes.labelsize'] = 12
rcParams['ytick.labelsize'] = 12
rcParams['xtick.labelsize'] = 12

# Build a date-indexed dataframe holding only the "A" column.
decomposition_dataframe = dataframe[['Date', 'A']].copy()
decomposition_dataframe.set_index('Date', inplace=True)
decomposition_dataframe.index = pd.to_datetime(decomposition_dataframe.index)

# Seasonal decomposition of the "A" column with statsmodels.
# Multiplicative Model : Y[t] = T[t] * S[t] * R[t]
# `period` replaces the former `freq` keyword, which statsmodels
# deprecated and later removed from seasonal_decompose.
decomposition = sm.tsa.seasonal_decompose(decomposition_dataframe,
                                          model='multiplicative',
                                          period=5)
decomp = decomposition.plot()
decomp.suptitle('"A" Value Decomposition')
Python3
from pandas.plotting import autocorrelation_plot
# Draw the autocorrelation plot for the "A" column
autocorrelation_plot(dataframe['A'])
Python3
# Year-wise and month-wise box plots of the "A" column, side by side.
# Relies on `plt`, `sns` and the 'Year' / 'Month' columns set up by an
# earlier snippet.

# Split the figure into a 1 x 2 grid of subplots
fig, ax = plt.subplots(nrows=1, ncols=2,
                       figsize=(15, 6))

# Year-wise box plot. x and y are passed by keyword: seaborn >= 0.12
# accepts only `data` positionally, so boxplot(series, series) raises
# TypeError there.
sns.boxplot(x=dataframe['Year'],
            y=dataframe["A"], ax=ax[0])

# Title and axis names for the year-wise plot
ax[0].set_title('Year-wise Box Plot for A',
                fontsize=20, loc='center')
ax[0].set_xlabel('Year')
ax[0].set_ylabel('"A" values')

# Month-wise box plot, with the same keyword form
sns.boxplot(x=dataframe['Month'],
            y=dataframe["A"], ax=ax[1])

# Title and axis names for the month-wise plot
ax[1].set_title('Month-wise Box Plot for A',
                fontsize=20, loc='center')
ax[1].set_xlabel('Month')
ax[1].set_ylabel('"A" values')

# Rotate the x tick labels and right-align them
fig.autofmt_xdate()
Python3
# 'Change' holds each "A" value divided by the shifted "A" column
# (shift() is called with its default arguments).
dataframe['Change'] = dataframe.A.div(dataframe.A.shift())
# Plot the 'Change' column with axis labels and a title
dataframe['Change'].plot(figsize=(15, 10),
xlabel = "Date",
ylabel = "Value Difference",
title = "Shift Plot")
Python3
import calendar
import seaborn as sns
import pandas as pd
# Heatmap of "A" by month (rows) and year (columns). `dataframe` and
# `plt` are expected to come from earlier snippets.
dataframe['Date'] = dataframe.index
# Split the Date into Year and Month columns
dataframe['Year'] = dataframe['Date'].dt.year
dataframe['Month'] = dataframe['Date'].dt.month
# Pivot table of the "A" values, indexed by Month with one column per
# Year. Missing month/year cells are filled with 0, and margins=True
# adds the 'All' row/column.
table_df = pd.pivot_table(dataframe, values=["A"],
index=["Month"],
columns=["Year"],
fill_value=0,
margins=True)
# Row labels: twelve month abbreviations plus 'All' for the margins row.
# The month names could also be generated with calendar.month_abbr[i].
# NOTE(review): the hard-coded 13 labels assume every month 1-12 appears
# in the data — confirm if the dataset changes.
mon_name = [['Jan', 'Feb', 'Mar', 'Apr',
'May', 'Jun', 'Jul', 'Aug',
'Sep','Oct', 'Nov', 'Dec', 'All']]
# Replace the numeric Month index with the month names
table_df = table_df.set_index(mon_name)
# Draw an annotated heatmap with the 'RdYlGn_r' (red/yellow/green,
# reversed) colormap; cell annotations use the '.2f' format.
ax = sns.heatmap(table_df, cmap='RdYlGn_r',
robust=True, fmt='.2f',
annot=True, linewidths=.6,
annot_kws={'size':10},
cbar_kws={'shrink':.5,
'label':'"A" values'})
# Set the tick labels, title and x & y labels
ax.set_yticklabels(ax.get_yticklabels())
ax.set_xticklabels(ax.get_xticklabels())
plt.title('"A" Value Analysis', pad=14)
plt.xlabel('Year')
plt.ylabel('Months')
输出:
绘制时间序列数据
绘制基于时间序列的折线图:
折线图用于表示不同轴上两个数据 X 和 Y 之间的关系。
Syntax: plt.plot(x)
示例 1:此图显示了从 2020 年 1 月到 2021 年 4 月 A 列值的变化。请注意,这些值总体上呈正趋势,但在此过程中会有起伏。
Python3
import matplotlib.pyplot as plt
# Line plot of the "A" column. `dataframe` is expected to have been
# created by an earlier snippet.
# Use a built-in style sheet to change the look and feel of the plot
plt.style.use("fivethirtyeight")
# Create a new figure with figsize (12, 10)
plt.figure(figsize=(12, 10))
# Label the axes and set a title
plt.xlabel("Date")
plt.ylabel("Values")
plt.title("Sample Time Series Plot")
# Plot the "A" column alone
plt.plot(dataframe["A"])
输出:
示例 2:绘制所有变量。
Python3
# Use the built-in "fivethirtyeight" style sheet
plt.style.use("fivethirtyeight")
# Plot the dataframe's columns with subplots=True (one subplot per
# column) on a figure of size (12, 15)
dataframe.plot(subplots=True, figsize=(12, 15))
输出:
绘制基于时间序列的条形图:
条形图(也称柱状图)是用矩形条表示数据类别的图形,矩形条的长度或高度与它们所代表的值成正比。条形图可以水平或垂直绘制。条形图描述了离散类别之间的比较。图的一个轴代表正在比较的特定类别,而另一个轴代表与这些类别对应的测量值。
Syntax: plt.bar(x, height, width, bottom, align)
此条形图表示“A”列值的变化。这可用于比较未来值和过去值。
Python3
import matplotlib.pyplot as plt
# Bar plot of the "A" column. `dataframe` is expected to have been
# created by an earlier snippet.
# Use a built-in style sheet to change the look and feel of the plot
plt.style.use("fivethirtyeight")
# Create a new figure with figsize (15, 10)
plt.figure(figsize=(15, 10))
# Label the axes and set a title
plt.xlabel("Date")
plt.ylabel("Values")
plt.title("Bar Plot of 'A'")
# Draw one bar per index entry, with the "A" value as its height
# and a bar width of 5
plt.bar(dataframe.index, dataframe["A"], width=5)
输出:
绘制基于时间序列的滚动均值图:
从数据帧的开头到结尾滑动的 n 大小窗口的平均值称为滚动平均值。如果窗口没有 n 个观测值,则返回 NaN。
Syntax: pandas.DataFrame.rolling(n).mean()
例子:
Python3
# Rolling mean of every column over a window of 5 rows; rows whose
# window holds fewer than 5 observations come out as NaN.
dataframe.rolling(window = 5).mean()
输出:
在这里,我们将使用滚动均值图绘制时间序列:
Python3
import matplotlib.pyplot as plt
# Plot the "A" column together with its rolling mean. `dataframe` is
# expected to have been created by an earlier snippet.
# Use a built-in style sheet to change the look and feel of the plot
plt.style.use("fivethirtyeight")
# Create a new figure with figsize (12, 10)
plt.figure(figsize=(12, 10))
# Label the axes and set a title
plt.xlabel("Date")
plt.ylabel("Values")
plt.title("Values of 'A' and Rolling Mean (2) Plot")
# Plot the original "A" column, then the "A" column of the rolling-mean
# dataframe (window = 2). min_periods=1 lets the first row produce a
# value from a single observation instead of NaN.
plt.plot(dataframe["A"])
plt.plot(dataframe.rolling(
window=2, min_periods=1).mean()["A"])
输出:
解释:
- 蓝色绘图线代表原始的“A”列值,而红色绘图线代表窗口大小 = 2 的“A”列值的滚动平均值
- 通过此图,我们推断时间序列数据的滚动平均值返回波动较小的值。保留了绘图的趋势,但丢弃了不那么重要的不需要的起伏。
- 对于绘制时间序列数据的分解、箱线图分析等,使用滚动均值数据框是一个很好的做法,这样波动就不会影响分析,尤其是在预测趋势时。
时间序列分解:
它在同一个图中显示以下四个部分(观察值以及分解得到的三个成分):
- 趋势组件:它显示跨越不同季节期间的数据模式。它代表“A”值在 2 年内没有波动的变化。
- 季节性成分:此图显示了“A”值的起伏,即反复出现的正常变化。
- 残差分量:这是将“A”值数据分解为趋势和季节性分量后的剩余分量。
- 观察值(Observed):即原始的“A”值序列。分解得到的趋势和季节性成分可用于出于各种目的研究数据。
例子:
Python3
import statsmodels.api as sm
from pylab import rcParams
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Expose the datetime index as a "Date" column and split it into
# Year and Month columns (later snippets read these columns).
dataframe['Date'] = dataframe.index
dataframe['Year'] = dataframe['Date'].dt.year
dataframe['Month'] = dataframe['Date'].dt.month

# Use a built-in style sheet
plt.style.use("fivethirtyeight")

# Set the runtime configuration parameters BEFORE plotting: figure size
# and label sizes are read when a figure is created, so setting them
# after decomposition.plot() would leave that figure unchanged.
rcParams['figure.figsize'] = 12, 10
rcParams['axes.labelsize'] = 12
rcParams['ytick.labelsize'] = 12
rcParams['xtick.labelsize'] = 12

# Build a date-indexed dataframe holding only the "A" column.
decomposition_dataframe = dataframe[['Date', 'A']].copy()
decomposition_dataframe.set_index('Date', inplace=True)
decomposition_dataframe.index = pd.to_datetime(decomposition_dataframe.index)

# Seasonal decomposition of the "A" column with statsmodels.
# Multiplicative Model : Y[t] = T[t] * S[t] * R[t]
# `period` replaces the former `freq` keyword, which statsmodels
# deprecated and later removed from seasonal_decompose.
decomposition = sm.tsa.seasonal_decompose(decomposition_dataframe,
                                          model='multiplicative',
                                          period=5)
decomp = decomposition.plot()
decomp.suptitle('"A" Value Decomposition')
输出:
绘制基于时间序列的自相关图:
它是用于检查数据集中随机性的常用工具。这种随机性是通过计算不同时间滞后的数据值的自相关来确定的。它显示了一种称为时间序列的数据类型的属性。大多数通用统计软件程序都提供这些图。可以使用 pandas.plotting.autocorrelation_plot() 绘制它。
Syntax: pandas.plotting.autocorrelation_plot(series, ax=None, **kwargs)
Parameters:
- series: This parameter is the Time series to be used to plot.
- ax: This parameter is a matplotlib axes object. Its default value is None.
Returns: This function returns an object of class matplotlib.axes.Axes
考虑到趋势、季节性、循环和残差,该图显示时间序列数据的当前值与先前值相关。我们可以看到,显着比例的线显示出与时间的有效相关性,我们可以使用这样的相关性图来研究时间序列数据的内在相关性。
代码:
Python3
from pandas.plotting import autocorrelation_plot
# Draw the autocorrelation plot for the "A" column
autocorrelation_plot(dataframe['A'])
输出:
绘制基于时间序列的箱线图:
箱线图是通过四分位数描述的数值数据组的可视化表示。 Boxplot 还用于检测数据集中的异常值。它使用简单的盒子和胡须有效地捕获数据摘要,并允许我们轻松地进行跨组比较。 Boxplot 使用第 25 个、第 50 个和第 75 个百分位数汇总样本数据。
Syntax: seaborn.boxplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None, orient=None, color=None, palette=None, saturation=0.75, width=0.8, dodge=True, fliersize=5, linewidth=None, whis=1.5, ax=None, **kwargs)
Parameters:
x, y, hue: Inputs for plotting long-form data.
data: Dataset for plotting. If x and y are absent, this is interpreted as wide-form.
color: Color for all of the elements.
Returns: It returns the Axes object with the plot drawn onto it.
在这里,通过这些图,我们将能够直观地了解每年(Year-wise Box Plot)和每个月(Month-wise Box Plot)的“A”值范围。此外,通过 Month-wise Box Plot,我们可以观察到,与其他月份相比,1 月和 2 月的值范围略高。
Python3
# Year-wise and month-wise box plots of the "A" column, side by side.
# Relies on `plt`, `sns` and the 'Year' / 'Month' columns set up by an
# earlier snippet.

# Split the figure into a 1 x 2 grid of subplots
fig, ax = plt.subplots(nrows=1, ncols=2,
                       figsize=(15, 6))

# Year-wise box plot. x and y are passed by keyword: seaborn >= 0.12
# accepts only `data` positionally, so boxplot(series, series) raises
# TypeError there.
sns.boxplot(x=dataframe['Year'],
            y=dataframe["A"], ax=ax[0])

# Title and axis names for the year-wise plot
ax[0].set_title('Year-wise Box Plot for A',
                fontsize=20, loc='center')
ax[0].set_xlabel('Year')
ax[0].set_ylabel('"A" values')

# Month-wise box plot, with the same keyword form
sns.boxplot(x=dataframe['Month'],
            y=dataframe["A"], ax=ax[1])

# Title and axis names for the month-wise plot
ax[1].set_title('Month-wise Box Plot for A',
                fontsize=20, loc='center')
ax[1].set_xlabel('Month')
ax[1].set_ylabel('"A" values')

# Rotate the x tick labels and right-align them
fig.autofmt_xdate()
输出:
移位(Shift)分析:
该图通过将 'A' 列的当前值除以 'A' 列的移位值(即前一个观测值)而获得;shift() 默认移位 1 个周期。该图用于分析相邻观测之间数值的稳定性。
Python3
# 'Change' holds each "A" value divided by the shifted "A" column
# (shift() is called with its default arguments).
dataframe['Change'] = dataframe.A.div(dataframe.A.shift())
# Plot the 'Change' column with axis labels and a title
dataframe['Change'].plot(figsize=(15, 10),
xlabel = "Date",
ylabel = "Value Difference",
title = "Shift Plot")
输出:
绘制基于时间序列的热图:
我们可以解释“A”列值在 12 个月内采样的年份的趋势、不同年份的值变化等。我们还可以推断这些值与平均值的变化情况。这个热图是一个非常有用的可视化。此热图显示了“A”值跨年和跨月的变化,使用颜色图进行区分。
Python3
import calendar
import seaborn as sns
import pandas as pd
# Heatmap of "A" by month (rows) and year (columns). `dataframe` and
# `plt` are expected to come from earlier snippets.
dataframe['Date'] = dataframe.index
# Split the Date into Year and Month columns
dataframe['Year'] = dataframe['Date'].dt.year
dataframe['Month'] = dataframe['Date'].dt.month
# Pivot table of the "A" values, indexed by Month with one column per
# Year. Missing month/year cells are filled with 0, and margins=True
# adds the 'All' row/column.
table_df = pd.pivot_table(dataframe, values=["A"],
index=["Month"],
columns=["Year"],
fill_value=0,
margins=True)
# Row labels: twelve month abbreviations plus 'All' for the margins row.
# The month names could also be generated with calendar.month_abbr[i].
# NOTE(review): the hard-coded 13 labels assume every month 1-12 appears
# in the data — confirm if the dataset changes.
mon_name = [['Jan', 'Feb', 'Mar', 'Apr',
'May', 'Jun', 'Jul', 'Aug',
'Sep','Oct', 'Nov', 'Dec', 'All']]
# Replace the numeric Month index with the month names
table_df = table_df.set_index(mon_name)
# Draw an annotated heatmap with the 'RdYlGn_r' (red/yellow/green,
# reversed) colormap; cell annotations use the '.2f' format.
ax = sns.heatmap(table_df, cmap='RdYlGn_r',
robust=True, fmt='.2f',
annot=True, linewidths=.6,
annot_kws={'size':10},
cbar_kws={'shrink':.5,
'label':'"A" values'})
# Set the tick labels, title and x & y labels
ax.set_yticklabels(ax.get_yticklabels())
ax.set_xticklabels(ax.get_xticklabels())
plt.title('"A" Value Analysis', pad=14)
plt.xlabel('Year')
plt.ylabel('Months')
输出: