Remove Duplicates - on a column
- remove duplicates on a particular column
- Adjust the remaining columns based on duplication applied column
- It removes duplicate values and returns unique values
import pandas as pd
rows = [(14,15,16),(14,18,22),(22,18,17),(14,18,22)]
headers=['Apple','Banana','Grapes']
df = pd.DataFrame(rows, columns = headers)
print(df)
q = df.drop_duplicates(['Apple'])
print(q)
- It returns whether a particular column is duplicate or not
- If the row is duplicate, it returns TRUE, else returns FALSE
import pandas as pd
rows = [(14,15,16),(14,18,22),(22,18,17),(14,18,22)]
headers=['Apple','Banana','Grapes']
df = pd.DataFrame(rows, columns = headers)
q = df.duplicated()
print(q)
Add a column to DataFrame with amount
import pandas as pd
df = pd.DataFrame([5,6], index = ['apple','banana'])
df['grapes']=35
print(df)
Add serial number to the newly created
import pandas as pd
import numpy as np
df = pd.DataFrame([5,6], index = ['apple','banana'])
df['grapes']=np.arange(2)
print(df)
- del df['columnHeader']
import pandas as pd
df = pd.DataFrame([5,6,7,6], index = ['apple','banana','orange','Grapes'])
df1 = df.reindex(['apple','Grapes','orange','banana'])
print(df1)
Note:df.ix was Depricated
Returns number of rows
- print(len(df))
Rows in first column
FirstcolumnRows = df[df.columns[0]]
rowscount = FirstcolumnRows.count()
print(rowscount)
Returns rows and columns of dataframe
- print(df.shape)
Returns data type of each column
- print(df.dtypes)
Print the columns
- print(df.columns)
Print columns in a List
- print(list(df.columns))
rename a column name and keep in same place
- df.rename(columns = {'sales' : 'salePrice', 'purchase' : 'purchasePrice'}, inplace=True)
Print the columns
- print(df.columns)
Provides the information about the data frame
- print(df.info())
Drop a column and print the remaining dataframe
- print(df.drop('sales', axis=1).head(11))
- df.drop(['Sales', 'Purchase'], axis=1, inplace=True)
Drops all rows that consists of null values
- df.dropna()
Drops all columns consists of null values
- df.dropna(axis=1)
Drops all the rows having less than value 100
- df.dropna(axis=1,thresh=100)
Fill null values with 1500
- df.fillna(1500)
Fill null values with std
- df.fillna(df.std())
Converts the data type to float
- df['Sales'].astype(float)
Print row as series
- print(df.ix[1])
Print all rows
- print(df.ix[:])
Print First five rows
- print(df.ix[0:4])
Print five rows from second row of PRICE column
- print(df.ix[1:5,['Price']])
Print First four rows of PRICE and SYMBOLS columns
- print(df.ix[0:3,['Symbols','Price']])
Print Every alternate row, 0,2,4,6....
- print(df.ix[::2])
Print second row and 7th column value
- print(df.ix[1,6])
Create new dataframe with second and 6th rows and print the result
- df1 = df.ix[[1,5]]
Create new data frame with second and 6th columns and Open and Low columns
- df1 = df.ix[[1,5],['Open', 'Low']]
- print(df1)
Print rows without order
- print(df.ix[[4,8,3,1]])
rows count starts from zero
Print 1,2,3 rows
- print(df.iloc[1:4])
Print 0,1,2,3 rows
- print(df.iloc[0:4])
Print rows without order\sequence
- print(df.iloc[[1,5,4,8,2]])
Print 1,2,3 rows and 6,7,8 columns
- print(df.iloc[1:4, 6:9])
Print required rows and columns
- print(df.loc[[1,8,3,4],['Price','sales']])
Print upto 5(0,1,2,3,4,5) rows with columns range
- print(df.loc[:5,'Price':'sales'])
From 5th row to last row and from sales column to last column
- print(df.loc[5:,'sales':])
Range of rows and defined columns
- print(df.loc[3:8,['Sales','Price','Purchases']])
Store the row numbers and column numbers into variables
- RowNumbers = [1,5,8,4,3]
- ColumnNumbers = ['Sales','Purchases','Expenses','Profit']
- print(df.loc[RowNumbers,ColumnNumbers])
change the column data based on headers
df['Symbols'] = 'Hello'
df['Open'] = 180
print(df)
Rows greater than 1500 in sales column
df1 = df[df['sales'] > 1500]
print(df1)
Print range values - between Min and Max
df1 = df[(df['Sales'] > 1500) & (df['Sales'] < 3000)]
print(df1)
Sort the column values
df1 = df.sort_values('Sales')
print(df1['Open'].head())
Sort the column values in descending order
df1 = df.sort_values('Open',ascending=False)
print(df1['Open'].head())
Change the order of a column permanently
df.sort_values('Open',ascending=False, inplace=True)
print(df['Open'].head())
Denotes about the summary of statistics for the numerical column
- print(df.describe())
To a particular column
- print(df['Sales'].describe())
Mean to all the columns
- print(df.mean())
Returns correlation between the columns in dataframe
- print(df.corr())
Returns count to all the columns\ required columns of data frame
- It considers non null values
print(df.count())
print(df['Symbols'].count())
Returns max value to all the columns in dataframe
- print(df.max())
Returns min value to all the columns in dataframe
- print(df.min())
Returns median to each column of dataframe
- print(df.median())
Returns standard deviation to each column of dataframe
- print(df.std())
- It appends the second dataframe at the end of first dataframe
- Columns should be same
df1 = df['Symbols'].head(6)
df2 = df['Symbols'].tail(3)
print(df1.append(df2))
- It appends the second dataframe after first dataframe
- Rows should be same
df1 = df['Symbols'].head(6)
df2 = df['High'].head(3)
print(pd.concat([df1, df2],axis=1))
Join Types
df1 = print(df.ix[1:5,['Symbols','High']])
df2 = print(df.ix[1:5,['Symbols','LTP']])
print(df1.join(df2,on='Symbols',how='Left'))
Replace
Replace hunderd with five
- df.replace(100,'five')
replace 15 with first, 28 with second, 34 with third
- df.replace([15,28,34],['first','second','Third'])
Rename a column
- df.rename(columns={'old_name': 'new_name'})
df = pd.read_csv('sales.txt')
data.to_csv('static/sales.csv')
print(df)