Ranking

Remove Duplicates - on a column

remove duplicates on a particular column

Adjust the remaining columns based on duplication applied column

It removes duplicate values and returns unique values

import pandas as pd
rows = [(14,15,16),(14,18,22),(22,18,17),(14,18,22)]
headers=['Apple','Banana','Grapes']
df = pd.DataFrame(rows, columns = headers)
print(df)
q = df.drop_duplicates(['Apple'])
print(q)

Duplicate particular row

It returns whether a particular column is duplicate or not
If the row is duplicate, it returns TRUE, else returns FALSE

import pandas as pd
rows = [(14,15,16),(14,18,22),(22,18,17),(14,18,22)]
headers=['Apple','Banana','Grapes']
df = pd.DataFrame(rows, columns = headers)
q = df.duplicated()
print(q)

Add a column to DataFrame with amount

import pandas as pd
df = pd.DataFrame([5,6], index = ['apple','banana'])
df['grapes']=35
print(df)

Add serial number to the newly created

import pandas as pd
import numpy as np
df = pd.DataFrame([5,6], index = ['apple','banana'])
df['grapes']=np.arange(2)
print(df)

Delete a column

del df['columnHeader']

import pandas as pd
df = pd.DataFrame([5,6,7,6], index = ['apple','banana','orange','Grapes'])
df1 = df.reindex(['apple','Grapes','orange','banana'])
print(df1)

Data Extraction

Note:df.ix was Depricated

Basic Info

Returns number of rows

print(len(df))

Rows in first column

FirstcolumnRows = df[df.columns[0]]
rowscount = FirstcolumnRows.count()
print(rowscount)

Returns rows and columns of dataframe

print(df.shape)

Returns data type of each column

print(df.dtypes)

Print the columns

print(df.columns)

Print columns in a List

print(list(df.columns))

rename a column name and keep in same place

df.rename(columns = {'sales' : 'salePrice', 'purchase' : 'purchasePrice'}, inplace=True)

Print the columns

print(df.columns)

Provides the information about the data frame

print(df.info())

drop a column

Drop a column and print the remaining dataframe

print(df.drop('sales', axis=1).head(11))
df.drop(['Sales', 'Purchase'], axis=1, inplace=True)

Drops all rows that consists of null values

df.dropna()

Drops all columns consists of null values

df.dropna(axis=1)

Drops all the rows having less than value 100

df.dropna(axis=1,thresh=100)

Fill null values with 1500

df.fillna(1500)

Fill null values with std

df.fillna(df.std())

Converts the data type to float

df['Sales'].astype(float)

Print row as series

print(df.ix[1])

Print all rows

print(df.ix[:])

Print First five rows

print(df.ix[0:4])

Print five rows from second row of PRICE column

print(df.ix[1:5,['Price']])

Print First four rows of PRICE and SYMBOLS columns

print(df.ix[0:3,['Symbols','Price']])

Print Every alternate row, 0,2,4,6....

print(df.ix[::2])

Print second row and 7th column value

print(df.ix[1,6])

Create new dataframe with second and 6th rows and print the result

df1 = df.ix[[1,5]]

Create new data frame with second and 6th columns and Open and Low columns

df1 = df.ix[[1,5],['Open', 'Low']]
print(df1)

Print rows without order

print(df.ix[[4,8,3,1]])

iloc

rows count starts from zero

Print 1,2,3 rows

print(df.iloc[1:4])

Print 0,1,2,3 rows

print(df.iloc[0:4])

Print rows without order\sequence

print(df.iloc[[1,5,4,8,2]])

Print 1,2,3 rows and 6,7,8 columns

print(df.iloc[1:4, 6:9])

loc

Print required rows and columns

print(df.loc[[1,8,3,4],['Price','sales']])

Print upto 5(0,1,2,3,4,5) rows with columns range

print(df.loc[:5,'Price':'sales'])

From 5th row to last row and from sales column to last column

print(df.loc[5:,'sales':])

Range of rows and defined columns

print(df.loc[3:8,['Sales','Price','Purchases']])

Store the row numbers and column numbers into variables

RowNumbers = [1,5,8,4,3]
ColumnNumbers = ['Sales','Purchases','Expenses','Profit']
print(df.loc[RowNumbers,ColumnNumbers])

change the column data based on headers

df['Symbols'] = 'Hello'
df['Open'] = 180
print(df)

Filtering the data

Rows greater than 1500 in sales column

df1 = df[df['sales'] > 1500]
print(df1)

Print range values - between Min and Max

df1 = df[(df['Sales'] > 1500) & (df['Sales'] < 3000)]
print(df1)

Sort the column values

df1 = df.sort_values('Sales')
print(df1['Open'].head())

Sort the column values in descending order

df1 = df.sort_values('Open',ascending=False)
print(df1['Open'].head())

Change the order of a column permanently

df.sort_values('Open',ascending=False, inplace=True)
print(df['Open'].head())

Statistics

Denotes about the summary of statistics for the numerical column

print(df.describe())

To a particular column

print(df['Sales'].describe())

Mean to all the columns

print(df.mean())

Returns correlation between the columns in dataframe

print(df.corr())

Returns count to all the columns\ required columns of data frame

It considers non null values

print(df.count())
print(df['Symbols'].count())

Returns max value to all the columns in dataframe

print(df.max())

Returns min value to all the columns in dataframe

print(df.min())

Returns median to each column of dataframe

print(df.median())

Returns standard deviation to each column of dataframe

print(df.std())

Append\Join

It appends the second dataframe at the end of first dataframe
Columns should be same

df1 = df['Symbols'].head(6)
df2 = df['Symbols'].tail(3)
print(df1.append(df2))

It appends the second dataframe after first dataframe
Rows should be same

df1 = df['Symbols'].head(6)
df2 = df['High'].head(3)
print(pd.concat([df1, df2],axis=1))

Join Types

df1 = print(df.ix[1:5,['Symbols','High']])
df2 = print(df.ix[1:5,['Symbols','LTP']])
print(df1.join(df2,on='Symbols',how='Left'))

Replace

Replace hunderd with five

df.replace(100,'five')

replace 15 with first, 28 with second, 34 with third

df.replace([15,28,34],['first','second','Third'])

Rename a column

df.rename(columns={'old_name': 'new_name'})

Text Files

df = pd.read_csv('sales.txt')
data.to_csv('static/sales.csv')
print(df)