如何在Python中计算马氏距离

马氏距离（Mahalanobis distance）是多元空间中两个给定点之间的距离。该距离常用于涉及多个变量的统计分析，例如检测多元数据中的异常值。

用户需要安装并导入以下库以在Python中计算马氏距离：

numpy
pandas
scipy

安装上述所有软件包的语法：

pip3 install numpy pandas scipy

第 1 步：导入上面安装的所有库。

Python3

# Importing libraries
  
import numpy as np
import pandas as pd 
import scipy as stats

Python3

# Raw measurements: ten observations for each of five attributes
data = {
    'Price': [
        100000, 800000, 650000, 700000, 860000,
        730000, 400000, 870000, 780000, 400000,
    ],
    'Distance': [
        16000, 60000, 300000, 10000, 252000,
        350000, 260000, 510000, 2000, 5000,
    ],
    'Emission': [300, 400, 1230, 300, 400, 104, 632, 221, 142, 267],
    'Performance': [60, 88, 90, 87, 83, 81, 72, 91, 90, 93],
    'Mileage': [76, 89, 89, 57, 79, 84, 78, 99, 97, 99],
}

# Build the table; the columns follow the key order of the dict above
df = pd.DataFrame(data, columns=list(data))

Python3

# Importing libraries
  
import numpy as np
import pandas as pd 
import scipy as stats
  
# calculateMahalanobis function to calculate
# the Mahalanobis distance
def calculateMahalanobis(y=None, data=None, cov=None):
    """Return the squared Mahalanobis distance of each row of ``y``.

    Args:
        y: DataFrame of observations to score, one per row, with the same
            columns as ``data``.
        data: DataFrame of reference observations. Its column means are
            used as the centre and, when ``cov`` is not given, its
            covariance matrix is estimated with ``np.cov``.
        cov: Optional precomputed covariance matrix. Defaults to the
            covariance of ``data``.

    Returns:
        A 1-D NumPy array with one value per row of ``y``:
        ``(y - mean) @ inv(cov) @ (y - mean).T``. No square root is taken.

    Raises:
        numpy.linalg.LinAlgError: If the covariance matrix cannot be
            inverted.
    """
    # Centre on the per-column means. The axis has to be explicit: with
    # axis=None, recent pandas (2.0+) reduces the DataFrame to one scalar.
    y_mu = np.asarray(y - data.mean(axis=0), dtype=float)
    # "not cov" raises ValueError for an ndarray, so compare with None.
    if cov is None:
        cov = np.cov(data.values.T)
    inv_covmat = np.linalg.inv(cov)
    left = np.dot(y_mu, inv_covmat)
    # Row-wise dot product: equals the diagonal of left @ y_mu.T without
    # building the full n x n matrix.
    return np.sum(left * y_mu, axis=1)
  
# Create a new column in the dataframe that contains the
# Mahalanobis distance for each row. The helper defined above is named
# calculateMahalanobis and takes the rows to score as `y`.
df['calculateMahalanobis'] = calculateMahalanobis(
    y=df,
    data=df[['Price', 'Distance', 'Emission', 'Performance', 'Mileage']])

Python3

# Importing libraries
  
import numpy as np
import pandas as pd
import scipy as stats
  
# calculateMahalanobis function to calculate
# the Mahalanobis distance
def calculateMahalanobis(y=None, data=None, cov=None):
    """Return the squared Mahalanobis distance of each row of ``y``.

    Args:
        y: DataFrame of observations to score, one per row, with the same
            columns as ``data``.
        data: DataFrame of reference observations. Its column means are
            used as the centre and, when ``cov`` is not given, its
            covariance matrix is estimated with ``np.cov``.
        cov: Optional precomputed covariance matrix. Defaults to the
            covariance of ``data``.

    Returns:
        A 1-D NumPy array with one value per row of ``y``:
        ``(y - mean) @ inv(cov) @ (y - mean).T``. No square root is taken.

    Raises:
        numpy.linalg.LinAlgError: If the covariance matrix cannot be
            inverted.
    """
    # Centre on the per-column means. The axis has to be explicit: with
    # axis=None, recent pandas (2.0+) reduces the DataFrame to one scalar.
    y_mu = np.asarray(y - data.mean(axis=0), dtype=float)
    # "not cov" raises ValueError for an ndarray, so compare with None.
    if cov is None:
        cov = np.cov(data.values.T)
    inv_covmat = np.linalg.inv(cov)
    left = np.dot(y_mu, inv_covmat)
    # Row-wise dot product: equals the diagonal of left @ y_mu.T without
    # building the full n x n matrix.
    return np.sum(left * y_mu, axis=1)
  
# Raw measurements: ten observations for each of five attributes
data = {
    'Price': [
        100000, 800000, 650000, 700000, 860000,
        730000, 400000, 870000, 780000, 400000,
    ],
    'Distance': [
        16000, 60000, 300000, 10000, 252000,
        350000, 260000, 510000, 2000, 5000,
    ],
    'Emission': [300, 400, 1230, 300, 400, 104, 632, 221, 142, 267],
    'Performance': [60, 88, 90, 87, 83, 81, 72, 91, 90, 93],
    'Mileage': [76, 89, 89, 57, 79, 84, 78, 99, 97, 99],
}

# Build the table; the columns follow the key order of the dict above
df = pd.DataFrame(data, columns=list(data))
  
# Append a column with each row's Mahalanobis distance, measured
# against the five feature columns
df['calculateMahalanobis'] = calculateMahalanobis(
    data=df[['Price', 'Distance', 'Emission', 'Performance', 'Mileage']],
    y=df,
)

# Show the resulting table
print(df)

Python3

# Importing libraries
  
import numpy as np
import pandas as pd
import scipy as stats
from scipy.stats import chi2
  
# calculateMahalanobis Function to calculate
# the Mahalanobis distance
def calculateMahalanobis(y=None, data=None, cov=None):
    """Return the squared Mahalanobis distance of each row of ``y``.

    Args:
        y: DataFrame of observations to score, one per row, with the same
            columns as ``data``.
        data: DataFrame of reference observations. Its column means are
            used as the centre and, when ``cov`` is not given, its
            covariance matrix is estimated with ``np.cov``.
        cov: Optional precomputed covariance matrix. Defaults to the
            covariance of ``data``.

    Returns:
        A 1-D NumPy array with one value per row of ``y``:
        ``(y - mean) @ inv(cov) @ (y - mean).T``. No square root is taken.

    Raises:
        numpy.linalg.LinAlgError: If the covariance matrix cannot be
            inverted.
    """
    # Centre on the per-column means. The axis has to be explicit: with
    # axis=None, recent pandas (2.0+) reduces the DataFrame to one scalar.
    y_mu = np.asarray(y - data.mean(axis=0), dtype=float)
    # "not cov" raises ValueError for an ndarray, so compare with None.
    if cov is None:
        cov = np.cov(data.values.T)
    inv_covmat = np.linalg.inv(cov)
    left = np.dot(y_mu, inv_covmat)
    # Row-wise dot product: equals the diagonal of left @ y_mu.T without
    # building the full n x n matrix.
    return np.sum(left * y_mu, axis=1)
  
# Raw measurements: ten observations for each of five attributes
data = {
    'Price': [
        100000, 800000, 650000, 700000, 860000,
        730000, 400000, 870000, 780000, 400000,
    ],
    'Distance': [
        16000, 60000, 300000, 10000, 252000,
        350000, 260000, 510000, 2000, 5000,
    ],
    'Emission': [300, 400, 1230, 300, 400, 104, 632, 221, 142, 267],
    'Performance': [60, 88, 90, 87, 83, 81, 72, 91, 90, 93],
    'Mileage': [76, 89, 89, 57, 79, 84, 78, 99, 97, 99],
}

# Build the table; the columns follow the key order of the dict above
df = pd.DataFrame(data, columns=list(data))
  
# Creating a new column in the dataframe that holds
# the Mahalanobis distance for each row
df['Mahalanobis'] = calculateMahalanobis(y=df, data=df[[
  'Price', 'Distance', 'Emission','Performance', 'Mileage']])

# Calculate the p-value for each Mahalanobis distance as the upper tail
# of a chi-square distribution with k - 1 = 4 degrees of freedom (k = 5
# feature columns). The value 3 used previously is k - 1 for four
# variables. chi2.sf is the survival function, i.e. 1 - cdf.
# NOTE(review): some references use k degrees of freedom instead - confirm.
df['p'] = chi2.sf(df['Mahalanobis'], 4)

# Display the whole dataframe
print(df)

第 2 步：创建数据集。考虑 10 辆不同品牌汽车的数据。数据有五个部分：

价格
距离
产生的排放
性能
里程

Python3

# Raw measurements: ten observations for each of five attributes
data = {
    'Price': [
        100000, 800000, 650000, 700000, 860000,
        730000, 400000, 870000, 780000, 400000,
    ],
    'Distance': [
        16000, 60000, 300000, 10000, 252000,
        350000, 260000, 510000, 2000, 5000,
    ],
    'Emission': [300, 400, 1230, 300, 400, 104, 632, 221, 142, 267],
    'Performance': [60, 88, 90, 87, 83, 81, 72, 91, 90, 93],
    'Mileage': [76, 89, 89, 57, 79, 84, 78, 99, 97, 99],
}

# Build the table; the columns follow the key order of the dict above
df = pd.DataFrame(data, columns=list(data))

第 3 步：计算每个观测值的马氏距离。

Python3

# Importing libraries
  
import numpy as np
import pandas as pd 
import scipy as stats
  
# calculateMahalanobis function to calculate
# the Mahalanobis distance
def calculateMahalanobis(y=None, data=None, cov=None):
    """Return the squared Mahalanobis distance of each row of ``y``.

    Args:
        y: DataFrame of observations to score, one per row, with the same
            columns as ``data``.
        data: DataFrame of reference observations. Its column means are
            used as the centre and, when ``cov`` is not given, its
            covariance matrix is estimated with ``np.cov``.
        cov: Optional precomputed covariance matrix. Defaults to the
            covariance of ``data``.

    Returns:
        A 1-D NumPy array with one value per row of ``y``:
        ``(y - mean) @ inv(cov) @ (y - mean).T``. No square root is taken.

    Raises:
        numpy.linalg.LinAlgError: If the covariance matrix cannot be
            inverted.
    """
    # Centre on the per-column means. The axis has to be explicit: with
    # axis=None, recent pandas (2.0+) reduces the DataFrame to one scalar.
    y_mu = np.asarray(y - data.mean(axis=0), dtype=float)
    # "not cov" raises ValueError for an ndarray, so compare with None.
    if cov is None:
        cov = np.cov(data.values.T)
    inv_covmat = np.linalg.inv(cov)
    left = np.dot(y_mu, inv_covmat)
    # Row-wise dot product: equals the diagonal of left @ y_mu.T without
    # building the full n x n matrix.
    return np.sum(left * y_mu, axis=1)
  
# Create a new column in the dataframe that contains the
# Mahalanobis distance for each row. The helper defined above is named
# calculateMahalanobis and takes the rows to score as `y`.
df['calculateMahalanobis'] = calculateMahalanobis(
    y=df,
    data=df[['Price', 'Distance', 'Emission', 'Performance', 'Mileage']])

结合所有步骤：

例子：

Python3

# Importing libraries
  
import numpy as np
import pandas as pd
import scipy as stats
  
# calculateMahalanobis function to calculate
# the Mahalanobis distance
def calculateMahalanobis(y=None, data=None, cov=None):
    """Return the squared Mahalanobis distance of each row of ``y``.

    Args:
        y: DataFrame of observations to score, one per row, with the same
            columns as ``data``.
        data: DataFrame of reference observations. Its column means are
            used as the centre and, when ``cov`` is not given, its
            covariance matrix is estimated with ``np.cov``.
        cov: Optional precomputed covariance matrix. Defaults to the
            covariance of ``data``.

    Returns:
        A 1-D NumPy array with one value per row of ``y``:
        ``(y - mean) @ inv(cov) @ (y - mean).T``. No square root is taken.

    Raises:
        numpy.linalg.LinAlgError: If the covariance matrix cannot be
            inverted.
    """
    # Centre on the per-column means. The axis has to be explicit: with
    # axis=None, recent pandas (2.0+) reduces the DataFrame to one scalar.
    y_mu = np.asarray(y - data.mean(axis=0), dtype=float)
    # "not cov" raises ValueError for an ndarray, so compare with None.
    if cov is None:
        cov = np.cov(data.values.T)
    inv_covmat = np.linalg.inv(cov)
    left = np.dot(y_mu, inv_covmat)
    # Row-wise dot product: equals the diagonal of left @ y_mu.T without
    # building the full n x n matrix.
    return np.sum(left * y_mu, axis=1)
  
# Raw measurements: ten observations for each of five attributes
data = {
    'Price': [
        100000, 800000, 650000, 700000, 860000,
        730000, 400000, 870000, 780000, 400000,
    ],
    'Distance': [
        16000, 60000, 300000, 10000, 252000,
        350000, 260000, 510000, 2000, 5000,
    ],
    'Emission': [300, 400, 1230, 300, 400, 104, 632, 221, 142, 267],
    'Performance': [60, 88, 90, 87, 83, 81, 72, 91, 90, 93],
    'Mileage': [76, 89, 89, 57, 79, 84, 78, 99, 97, 99],
}

# Build the table; the columns follow the key order of the dict above
df = pd.DataFrame(data, columns=list(data))
  
# Append a column with each row's Mahalanobis distance, measured
# against the five feature columns
df['calculateMahalanobis'] = calculateMahalanobis(
    data=df[['Price', 'Distance', 'Emission', 'Performance', 'Mileage']],
    y=df,
)

# Show the resulting table
print(df)

输出：

计算每个马氏距离的 p 值

现在让我们为数据集中每个观测值的马氏距离计算 p 值。从上面的输出可以看出，某些马氏距离明显大于其他值。为了判断这些距离是否在统计上显著，我们需要计算它们的 p 值。每个距离的 p 值等于以该马氏距离作为卡方统计量、自由度为 k-1 时所对应的 p 值，其中 k 为变量个数。因此，在本例中我们使用 5-1 = 4 的自由度。

例子：

Python3

# Importing libraries
  
import numpy as np
import pandas as pd
import scipy as stats
from scipy.stats import chi2
  
# calculateMahalanobis Function to calculate
# the Mahalanobis distance
def calculateMahalanobis(y=None, data=None, cov=None):
    """Return the squared Mahalanobis distance of each row of ``y``.

    Args:
        y: DataFrame of observations to score, one per row, with the same
            columns as ``data``.
        data: DataFrame of reference observations. Its column means are
            used as the centre and, when ``cov`` is not given, its
            covariance matrix is estimated with ``np.cov``.
        cov: Optional precomputed covariance matrix. Defaults to the
            covariance of ``data``.

    Returns:
        A 1-D NumPy array with one value per row of ``y``:
        ``(y - mean) @ inv(cov) @ (y - mean).T``. No square root is taken.

    Raises:
        numpy.linalg.LinAlgError: If the covariance matrix cannot be
            inverted.
    """
    # Centre on the per-column means. The axis has to be explicit: with
    # axis=None, recent pandas (2.0+) reduces the DataFrame to one scalar.
    y_mu = np.asarray(y - data.mean(axis=0), dtype=float)
    # "not cov" raises ValueError for an ndarray, so compare with None.
    if cov is None:
        cov = np.cov(data.values.T)
    inv_covmat = np.linalg.inv(cov)
    left = np.dot(y_mu, inv_covmat)
    # Row-wise dot product: equals the diagonal of left @ y_mu.T without
    # building the full n x n matrix.
    return np.sum(left * y_mu, axis=1)
  
# Raw measurements: ten observations for each of five attributes
data = {
    'Price': [
        100000, 800000, 650000, 700000, 860000,
        730000, 400000, 870000, 780000, 400000,
    ],
    'Distance': [
        16000, 60000, 300000, 10000, 252000,
        350000, 260000, 510000, 2000, 5000,
    ],
    'Emission': [300, 400, 1230, 300, 400, 104, 632, 221, 142, 267],
    'Performance': [60, 88, 90, 87, 83, 81, 72, 91, 90, 93],
    'Mileage': [76, 89, 89, 57, 79, 84, 78, 99, 97, 99],
}

# Build the table; the columns follow the key order of the dict above
df = pd.DataFrame(data, columns=list(data))
  
# Creating a new column in the dataframe that holds
# the Mahalanobis distance for each row
df['Mahalanobis'] = calculateMahalanobis(y=df, data=df[[
  'Price', 'Distance', 'Emission','Performance', 'Mileage']])

# Calculate the p-value for each Mahalanobis distance as the upper tail
# of a chi-square distribution with k - 1 = 4 degrees of freedom (k = 5
# feature columns). The value 3 used previously is k - 1 for four
# variables. chi2.sf is the survival function, i.e. 1 - cdf.
# NOTE(review): some references use k degrees of freedom instead - confirm.
df['p'] = chi2.sf(df['Mahalanobis'], 4)

# Display the whole dataframe
print(df)

输出：

解释：

通常，p 值小于 0.001 的观察值被假定为异常值。在此示例中，没有异常值，因为所有 p 值都大于 0.001。