import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Read the global temperature records into a dataframe and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer='global_data.csv')
df.head(n=5)
| | year | avg_temp |
|---|---|---|
| 0 | 1750 | 8.72 |
| 1 | 1751 | 7.98 |
| 2 | 1752 | 5.78 |
| 3 | 1753 | 8.39 |
| 4 | 1754 | 8.47 |
# Quick structural overview of `df`: size, column types, missing values,
# cardinality and summary statistics.
# returns a (rows, columns) tuple with the dimensions of the dataframe
df.shape
# returns the datatype of each column
df.dtypes
# prints a concise summary of the dataframe,
# including the number of non-null values in each column
df.info()
# returns the number of unique values in each column
df.nunique()
# returns descriptive statistics (count, mean, std, min, quartiles, max)
# for the columns of data
df.describe()
# NOTE(review): repeats the shape check from the top of this section;
# presumably a separate notebook cell -- confirm whether it is still needed
df.shape
# number of missing values in each column of `red_df`
red_df.isna().sum()
# number of fully duplicated rows in `white_df`
white_df.duplicated().sum()
# number of fully duplicated rows in `df`
int(df.duplicated().sum())
# Use means to fill in missing values.
# The mean of a single column can be computed directly:
mean = df['texture_mean'].mean()
# Fill every numeric column with its own mean in one call.
# numeric_only=True keeps string/object columns out of the mean; without it
# recent pandas versions raise a TypeError on mixed-dtype frames.
df.fillna(df.mean(numeric_only=True), inplace=True)
# drop fully duplicated rows, keeping the first occurrence
df.drop_duplicates(inplace=True)
# confirm the correction by rechecking for duplicates in the data (expect 0)
df.duplicated().sum()
# Sales plots. The first column of `df` is the week; the remaining columns
# hold the sales figures, so every aggregate below skips column 0.

# sales for the last month
# NOTE(review): assumes the rows from position 196 onward make up the last
# month of the data -- confirm against the dataset
df.iloc[196:, 1:].sum().plot(kind='bar');
# average sales
# the week column is excluded explicitly: averaging a string column raises a
# TypeError in recent pandas versions
df.iloc[:, 1:].mean().plot(kind='pie');
# sales for the week of March 13th, 2016
sales = df[df['week'] == '2016-03-13']
sales.iloc[0, 1:].plot(kind='bar');
# sales for the latest 3-month period
# (ISO-formatted date strings compare correctly as plain strings)
last_three_months = df[df['week'] >= '2017-12-01']
last_three_months.iloc[:, 1:].sum().plot(kind='pie')
# rows recorded for the week of March 13, 2016
is_march_13 = df['week'] == '2016-03-13'
df.loc[is_march_13]
# week(s) in which store C had its lowest sales
lowest_store_c = df['store C'].min()
df.loc[df['store C'] == lowest_store_c]
# total sales over the most recent 3-month period
is_recent = df['week'] >= '2017-12-01'
last_three_months = df.loc[is_recent]
# skip the first column (week) so that only sales figures are summed
recent_sales = last_three_months.iloc[:, 1:]
recent_sales.sum()
# scatter plot of electrical output against temperature
df.plot.scatter(x='temperature', y='energy_output');
# histogram showing the distribution of humidity
humidity = df['humidity']
humidity.hist();
# box plot of the temperature column
temperature = df['temperature']
temperature.plot.box();
# frequency of each education level in the census data
education_counts = df_census['education'].value_counts()
education_counts
# The counts can be plotted directly as a bar chart.
education_counts.plot.bar();
# Pie charts are likewise drawn from value_counts().
workclass_counts = df_census['workclass'].value_counts()
workclass_counts.plot.pie(figsize=(8, 8));
# scatter plot of concavity_se against concavity_mean
df.plot.scatter(x='concavity_mean', y='concavity_se');
# box plot of the 'concave points_worst' column
df['concave points_worst'].plot.box()
# Global yearly average temperatures.
global_temp = pd.read_csv('global_data.csv')
# Yearly average temperatures for a single city, Hamburg.
city_temp = pd.read_csv('city_data1.csv')

# 10-year moving averages; the first nine entries are NaN because the
# window is not yet full.
glb_mv_avg = global_temp['avg_temp'].rolling(10).mean()
local_mv_avg = city_temp['avg_temp'].rolling(10).mean()

# Set the font size before drawing so that it applies to every figure below.
plt.rcParams.update({'font.size': 10})


def _plot_temperature_trends(trends, title):
    """Draw one line chart of temperature trends and show it.

    Args:
        trends: iterable of ``(years, values, label)`` tuples, one per line.
        title: title displayed above the chart.
    """
    for years, values, label in trends:
        plt.plot(years, values, label=label)
    plt.legend()
    plt.xlabel("Years")
    plt.ylabel("Temperature (°C)")
    plt.title(title)
    plt.show()


global_trend = (global_temp['year'], glb_mv_avg, 'Global')
hamburg_trend = (city_temp['year'], local_mv_avg, 'Hamburg')

# Global and Hamburg moving averages on the same axes.
_plot_temperature_trends(
    [global_trend, hamburg_trend],
    "Global vs. Hansa Hamburg Average Temperature",
)
# Hamburg only.
_plot_temperature_trends([hamburg_trend], "Hansa Hamburg Average Temperature")
# Global only.
_plot_temperature_trends([global_trend], "Global Average Temperature")
# scatter plot of electrical output against temperature
df.plot.scatter(x='temperature', y='energy_output');
# Load the Titanic passenger data before any plot that uses it.
titanic = pd.read_csv('titanic.csv')

# age by passenger class
sns.catplot(x='Pclass', y='Age', data=titanic)

# pairwise relationships between the columns of `df`
# NOTE(review): the hue variant needs a 'Day' column in `df` -- confirm which
# dataset is loaded at this point
sns.pairplot(df)
sns.pairplot(df, hue="Day")

# Keep the returned Axes under its own name so that `titanic` stays a
# DataFrame for the plots that follow.
boxen_ax = sns.boxenplot(x="Pclass", y="Age", data=titanic)

# age by passenger class, split by sex
sns.catplot(x='Pclass', y='Age', data=titanic, hue='Sex')
sns.boxplot(x='Pclass', y='Age', data=titanic, hue='Sex')

# joint distribution of age and fare; x and y are passed by keyword because
# current seaborn releases no longer accept them positionally
sns.jointplot(x='Age', y='Fare', data=titanic, kind='kde')
sns.jointplot(x='Age', y='Fare', data=titanic, kind='hex')