2024-11-11 22:30:06 +00:00
|
|
|
"""
|
|
|
|
I am not an expert on pandas and have had limited experience with it, but I'm going to cover
|
|
|
|
the very basics of it. Pandas is similar to a database (don't kill me) but is used mainly
|
|
|
|
for data analysis and manipulation. It is commonly used with the CSV format, which is also used for spreadsheets.
|
|
|
|
(I honestly am too lazy to write out my own examples, thank ChatGPT, but if you have any
|
|
|
|
questions I will be able to answer them!)
|
|
|
|
|
|
|
|
|
|
|
|
Key Concepts:
|
|
|
|
**DataFrame** is a 2-dimensional table with rows and columns, again much like a spreadsheet.
|
|
|
|
**Series** is a 1-dimensional array, like a single column of data.
|
|
|
|
"""
|
|
|
|
|
|
|
|
import pandas as pd

# 1. Creating a DataFrame from a Dictionary
# Each dictionary key becomes a column name and each list holds that
# column's values, one entry per row.
data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [24, 27, 22],
    "City": ["New York", "Los Angeles", "Chicago"],
}
df = pd.DataFrame(data)
print("DataFrame:\n", df)

# 2. Selecting Columns and Rows
# Select a single column (the result is a Series)
print("\nNames column:\n", df["Name"])

# Select multiple columns by passing a list of names (the result is a DataFrame)
print("\nName and Age columns:\n", df[["Name", "Age"]])

# Select a row by its integer position
print("\nFirst row:\n", df.iloc[0])

# Select rows with a condition: the comparison builds a boolean mask and
# only the rows where it is True are kept
print("\nPeople older than 23:\n", df[df["Age"] > 23])

# 3. Data Exploration
print("\nDataFrame Info:")
# info() writes its summary directly to stdout and returns None, so it is
# called on its own; wrapping it in print() would append a stray "None".
df.info()

# describe() summarises the numeric columns (count, mean, std, min, ...)
print("\nBasic Statistics:\n", df.describe())

# 4. Handling Missing Data
# None marks a missing entry; in the numeric "Age" column it is stored as NaN.
data_with_nan = {
    "Name": ["Alice", "Bob", None],
    "Age": [24, None, 22],
    "City": ["New York", "Los Angeles", None],
}
df_nan = pd.DataFrame(data_with_nan)
print("\nDataFrame with NaN:\n", df_nan)

# Fill missing values (returns a new DataFrame; df_nan itself is unchanged)
print("\nFill NaN with 'Unknown':\n", df_nan.fillna("Unknown"))

# Drop rows with missing values (also returns a new DataFrame)
print("\nDrop rows with NaN:\n", df_nan.dropna())

# 5. Adding and Modifying Columns
# Add a new column: the list must have one value per existing row
df["Salary"] = [70000, 80000, 50000]
print("\nDataFrame with Salary:\n", df)

# Update values based on a condition
# The 10% raise produces fractional values, so the integer column is cast to
# float first. Writing floats into an int64 column through .loc emits a
# FutureWarning on pandas 2.1+ and is slated to become an error.
df["Salary"] = df["Salary"].astype("float64")
df.loc[df["Age"] > 25, "Salary"] *= 1.10
print("\nUpdated Salary for Age > 25:\n", df)

# 6. Grouping and Aggregation
# Group by City and calculate the average Age
print("\nAverage Age by City:\n", df.groupby("City")["Age"].mean())

# 7. Reading from and Writing to CSV (uncomment to use)
# Read from a CSV
# Commented out as the file doesn't exist...
# df_csv = pd.read_csv("path/to/file.csv")
# print("\nData from CSV:\n", df_csv)

# Write to a CSV
# Commented out as I don't want to actually save it.
# index=False leaves the row index out of the written file.
# df.to_csv("path/to/save.csv", index=False)