Computer Science‎ > ‎

Stats, ML, Data: Computing the Pearson Product-Moment Correlation Coefficient

Pearson product-moment correlation coefficient

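The Pearson product-moment correlation coefficient is a measure of linear correlation described well on this Wikipedia page. The formula is given by:

$$r_{xy} = \frac{n\sum_{i=1}^{n} x_i y_i - \sum_{i=1}^{n} x_i \sum_{i=1}^{n} y_i}{\sqrt{n\sum_{i=1}^{n} x_i^2 - \left(\sum_{i=1}^{n} x_i\right)^2}\;\sqrt{n\sum_{i=1}^{n} y_i^2 - \left(\sum_{i=1}^{n} y_i\right)^2}}$$

where x and y denote the two vectors between which the correlation is to be measured, and n is their common length.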


Correlation coefficients are used to identify a mutual relationship and/or interdependence between 2 or more variables. 


Say you are given a file whose first line contains N, followed by N rows giving the scores of N candidates in three subjects A, B, C (one candidate per line, scores space-separated).

You need to calculate the Pearson coefficients between A and B, B and C, and A and C. 


C++ Program to compute the Pearson correlation coefficient between the pairs of variables 

#include <stdio.h>

#include <stdlib.h>

#include <vector>

#include <math.h>


using namespace std;


// Pearson correlation coefficient computed from running sums:
//   n   - number of samples
//   sxy - sum of x[i]*y[i]
//   sx, sy   - sums of x[i] and y[i]
//   sxx, syy - sums of x[i]^2 and y[i]^2
// Returns (n*sxy - sx*sy) / (sqrt(n*sxx - sx^2) * sqrt(n*syy - sy^2)).
static long double pearson(long double n, long double sxy,
                           long double sx, long double sy,
                           long double sxx, long double syy){
    long double num = n*sxy - sx*sy;
    long double den = sqrt(n*sxx - sx*sx) * sqrt(n*syy - sy*sy);
    return num/den;
}

// Reads N, then N rows of three integer scores, from stdin and prints the
// Pearson coefficients for (1st,2nd), (2nd,3rd) and (3rd,1st) columns, one per
// line with two decimals. Returns 1 if the input is malformed, 0 otherwise.
int main(){
    int n;
    // A failed read would leave n uninitialised; a negative n cannot size a vector.
    if(scanf("%d",&n)!=1 || n<0) return 1;

    vector<int> m(n),p(n),c(n);
    for(int i=0;i<n;i++){
        // The values are stored as read. Adding 0.01 to an int and truncating
        // back (as was done before) shifts every negative score up by one.
        if(scanf("%d %d %d",&m[i],&p[i],&c[i])!=3) return 1;
    }

    long long int ms=0,ps=0,cs=0;   // sums of squares
    long long int nm=0,np=0,nc=0;   // plain sums
    long long int mp=0,pc=0,cm=0;   // sums of pairwise products
    for(int i=0;i<n;i++){
        // Widen before multiplying so the products are formed in 64 bits.
        long long int mi=m[i],pi=p[i],ci=c[i];
        ms+=mi*mi; ps+=pi*pi; cs+=ci*ci;
        nm+=mi;    np+=pi;    nc+=ci;
        mp+=mi*pi; pc+=pi*ci; cm+=ci*mi;
    }

    long double mpcor=pearson(n,mp,nm,np,ms,ps);
    long double pccor=pearson(n,pc,np,nc,ps,cs);
    long double cmcor=pearson(n,cm,nc,nm,cs,ms);

    printf("%.2Lf\n%.2Lf\n%.2Lf",mpcor,pccor,cmcor);
    return 0;
}





Java Program to compute the Pearson coefficient between the pairs of variables 


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

/**
 * Reads three columns of integer scores from standard input and prints the
 * Pearson product-moment correlation coefficient for each pair of columns.
 *
 * <p>Input format: the first line holds the row count {@code n}; each of the
 * next {@code n} lines holds three integers separated by spaces and/or tabs.
 */
public class Solution {

    /**
     * Program entry point: prints the coefficients for (1st, 2nd), (2nd, 3rd)
     * and (3rd, 1st) columns, one per line, formatted with two decimals.
     *
     * @param args unused
     * @throws NumberFormatException if a token is not a valid integer
     * @throws IOException if reading fails or the input ends too early
     */
    public static void main(String[] args) throws NumberFormatException, IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        int n = Integer.parseInt(readRequiredLine(br).trim());
        int[] m = new int[n];
        int[] p = new int[n];
        int[] c = new int[n];
        for (int i = 0; i < n; i++) {
            // Split on any run of whitespace so that both the space-separated
            // and the tab-separated form of the input are accepted.
            String[] fields = readRequiredLine(br).trim().split("\\s+");
            m[i] = Integer.parseInt(fields[0]);
            p[i] = Integer.parseInt(fields[1]);
            c[i] = Integer.parseInt(fields[2]);
        }
        System.out.println(String.format("%.2f", calculatePearsonProd(m, p)));
        System.out.println(String.format("%.2f", calculatePearsonProd(p, c)));
        System.out.println(String.format("%.2f", calculatePearsonProd(c, m)));
    }

    /** Returns the next line, or throws if the stream has no more lines. */
    private static String readRequiredLine(BufferedReader reader) throws IOException {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Unexpected end of input");
        }
        return line;
    }

    /**
     * Computes the Pearson correlation coefficient of two samples using
     * {@code (n*Sab - Sa*Sb) / (sqrt(n*Saa - Sa^2) * sqrt(n*Sbb - Sb^2))}.
     *
     * @param a first sample
     * @param b second sample; read at the same indices as {@code a}
     * @return the coefficient; {@code NaN} when the arrays are empty or either
     *     sample is constant, because the expression is then 0/0
     */
    public static double calculatePearsonProd(int[] a, int[] b) {
        long sumA = sumArray(a);
        long sumB = sumArray(b);
        long sumAB = sumProductArray(a, b);
        long sumA2 = squareSumArray(a);
        long sumB2 = squareSumArray(b);
        int n = a.length;
        double numerator = n * sumAB - sumA * sumB;
        double denominator =
                Math.sqrt(n * sumA2 - sumA * sumA) * Math.sqrt(n * sumB2 - sumB * sumB);
        return numerator / denominator;
    }

    /**
     * Sums the elements of an array.
     *
     * @param a values to add up
     * @return the sum as a {@code long}
     */
    public static long sumArray(int[] a) {
        long sum = 0;
        for (int value : a) {
            sum += value;
        }
        return sum;
    }

    /**
     * Sums the squares of the elements of an array.
     *
     * @param a values to square and add up
     * @return the sum of squares as a {@code long}
     */
    public static long squareSumArray(int[] a) {
        long sum = 0;
        for (int value : a) {
            // Widen first: int * int would wrap around before being added.
            sum += (long) value * value;
        }
        return sum;
    }

    /**
     * Sums the element-wise products of two arrays.
     *
     * @param a first factor array; its length determines how many pairs are read
     * @param b second factor array
     * @return the sum of {@code a[i] * b[i]} as a {@code long}
     */
    public static long sumProductArray(int[] a, int[] b) {
        long sum = 0;
        for (int i = 0; i < a.length; i++) {
            // Widen first: int * int would wrap around before being added.
            sum += (long) a[i] * b[i];
        }
        return sum;
    }
}


Ruby Program to compute the Pearson Coefficients


# Computes the Pearson correlation coefficient from precomputed sums and
# returns it rounded to two decimal places.
#
# n  - number of samples
# x  - sum of the first variable,  y  - sum of the second variable
# x2 - sum of squares of the first variable, y2 - same for the second
# xy - sum of the element-wise products
def pearsonCor(n, x, y, x2, y2, xy)
  res = (n*xy - x*y).to_f / Math.sqrt(n*x2 - x*x) / Math.sqrt(n*y2 - y*y)
  res.round(2)
end

# Running sums for the three columns: plain sums (x, y, z), sums of squares
# (x2, y2, z2) and sums of pairwise products (xy, xz, yz).
x = x2 = y = y2 = z = z2 = xy = xz = yz = 0

# The first line is the row count; each following line holds three integers.
total = gets.to_i
total.times do
  xi, yi, zi = gets.split(' ').map(&:to_i)
  x += xi
  y += yi
  z += zi

  x2 += xi*xi
  y2 += yi*yi
  z2 += zi*zi

  xy += xi*yi
  xz += xi*zi
  yz += yi*zi
end

# Each call stays on one line with its argument list so the arguments are
# actually passed to pearsonCor.
puts pearsonCor(total, x, y, x2, y2, xy)
puts pearsonCor(total, z, y, z2, y2, yz)
puts pearsonCor(total, x, z, x2, z2, xz)


Python Program to compute the Pearson coefficient between the pairs of variables 


import math

def average(x):
    """Return the arithmetic mean of the non-empty sequence ``x`` as a float.

    An empty ``x`` trips the assertion.
    """
    count = len(x)
    assert count > 0
    total = float(sum(x))
    return total / count

def pearson_def(x, y):
    """Return the Pearson correlation coefficient of ``x`` and ``y``.

    Uses the definition directly: the sum of products of the deviations from
    the means, divided by the square root of the product of the two sums of
    squared deviations. Both sequences must be non-empty and equally long
    (checked with assertions).
    """
    assert len(x) == len(y)
    n = len(x)
    assert n > 0

    mean_x = average(x)
    mean_y = average(y)

    cross_sum = 0
    x_sq_sum = 0
    y_sq_sum = 0
    for x_val, y_val in zip(x, y):
        dev_x = x_val - mean_x
        dev_y = y_val - mean_y
        cross_sum += dev_x * dev_y
        x_sq_sum += dev_x * dev_x
        y_sq_sum += dev_y * dev_y

    denominator = math.sqrt(x_sq_sum * y_sq_sum)
    return cross_sum / denominator


# The first input line is the row count; each following line holds three
# whitespace-separated integers, one per column.
T = int(input())
arr1 = []
arr2 = []
arr3 = []
for _ in range(T):
    # Unpack directly; no extra name is kept for the consumed map iterator.
    x, y, z = map(int, input().split())
    arr1.append(x)
    arr2.append(y)
    arr3.append(z)

# Coefficients for (1st, 2nd), (2nd, 3rd) and (3rd, 1st) columns.
print("%.2f" % pearson_def(arr1,arr2))
print("%.2f" % pearson_def(arr2,arr3))
print("%.2f" % pearson_def(arr3,arr1))