# python_catchup_2024/5_pandas.py

"""
I am not an expert on pandas and have had limited experience with it, but I'm going to cover
the very basics of it. pandas is similar to a database (don't kill me) but is used mainly
for data analysis and manipulation. It mainly works with the CSV format, which is also used for spreadsheets.
(I honestly am too lazy to write out my own examples, thank ChatGPT, but if you have any
questions I will be able to answer them!)


Key Concepts:
    **DataFrame** is a 2-dimensional table with rows and columns, again the exact same as a spreadsheet.
    **Series** is a 1-dimensional array, like a single column of data.
"""

import pandas as pd

# 1. Creating a DataFrame from a Dictionary
# Each dictionary key becomes a column name; each list holds that column's values.
data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [24, 27, 22],
    "City": ["New York", "Los Angeles", "Chicago"],
}
df = pd.DataFrame(data)
print("DataFrame:\n", df)

# 2. Selecting Columns and Rows
# Select a single column (a string key gives back a Series)
print("\nNames column:\n", df["Name"])

# Select multiple columns (a list of keys gives back a DataFrame)
print("\nName and Age columns:\n", df[["Name", "Age"]])

# Select a row by its integer position
print("\nFirst row:\n", df.iloc[0])

# Select rows with a condition (the comparison builds a boolean mask)
print("\nPeople older than 23:\n", df[df["Age"] > 23])

# 3. Data Exploration
print("\nDataFrame Info:")
# info() writes its summary itself and returns None, so call it directly;
# wrapping it in print() would add a stray "None" line to the output.
df.info()

print("\nBasic Statistics:\n", df.describe())

# 4. Handling Missing Data
# None entries stand in for missing values in this example frame.
data_with_nan = {
    "Name": ["Alice", "Bob", None],
    "Age": [24, None, 22],
    "City": ["New York", "Los Angeles", None],
}
df_nan = pd.DataFrame(data_with_nan)
print("\nDataFrame with NaN:\n", df_nan)

# Fill missing values (returns a new DataFrame; df_nan itself is unchanged)
print("\nFill NaN with 'Unknown':\n", df_nan.fillna("Unknown"))

# Drop rows with missing values (also returns a new DataFrame)
print("\nDrop rows with NaN:\n", df_nan.dropna())

# 5. Adding and Modifying Columns
# Add a new column (one value per existing row)
df["Salary"] = [70000, 80000, 50000]
print("\nDataFrame with Salary:\n", df)

# Update values based on a condition
# The 10% raise produces fractional values, so convert the integer column to
# float first instead of relying on pandas silently changing the column's
# dtype during the assignment (deprecated in recent pandas releases).
df["Salary"] = df["Salary"].astype(float)
df.loc[df["Age"] > 25, "Salary"] *= 1.10
print("\nUpdated Salary for Age > 25:\n", df)

# 6. Grouping and Aggregation
# Group by City and calculate the average Age
print("\nAverage Age by City:\n", df.groupby("City")["Age"].mean())

# 7. Reading from and Writing to CSV (uncomment to use)
# Read from a CSV
# Commented out as the file doesn't exist...
# df_csv = pd.read_csv("path/to/file.csv")
# print("\nData from CSV:\n", df_csv)

# Write to a CSV
# Commented out as I don't want to actually save it.
# df.to_csv("path/to/save.csv", index=False)
finished pandas tutorial... 2024-11-11 22:30:06 +00:00			`"""`
			`I am not an expert on pandas and have had limited experience with it, but I'm going to cover`
			`the very basics of it. pandas is similar to a database (don't kill me) but is used mainly`
			`for data analysis and manipulation. It mainly works with the CSV format, which is also used for spreadsheets.`
			`(I honestly am too lazy to write out my own examples, thank ChatGPT, but if you have any`
			`questions I will be able to answer them!)`


			`Key Concepts:`
			`DataFrame is a 2-dimensional table with rows and columns, again the exact same as a spreadsheet.`
			`Series is a 1-dimensional array, like a single column of data.`
			`"""`

			`import pandas as pd`

			`# 1. Creating a DataFrame from a Dictionary`
			`data = {`
			`"Name": ["Alice", "Bob", "Charlie"],`
			`"Age": [24, 27, 22],`
			`"City": ["New York", "Los Angeles", "Chicago"]`
			`}`
			`df = pd.DataFrame(data)`
			`print("DataFrame:\n", df)`

			`# 2. Selecting Columns and Rows`
			`# Select a single column`
			`print("\nNames column:\n", df["Name"])`

			`# Select multiple columns`
			`print("\nName and Age columns:\n", df[["Name", "Age"]])`

			`# Select a row by index`
			`print("\nFirst row:\n", df.iloc[0])`

			`# Select rows with a condition`
			`print("\nPeople older than 23:\n", df[df["Age"] > 23])`

			`# 3. Data Exploration`
			`print("\nDataFrame Info:")`
			`print(df.info())`

			`print("\nBasic Statistics:\n", df.describe())`

			`# 4. Handling Missing Data`
			`data_with_nan = {`
			`"Name": ["Alice", "Bob", None],`
			`"Age": [24, None, 22],`
			`"City": ["New York", "Los Angeles", None]`
			`}`
			`df_nan = pd.DataFrame(data_with_nan)`
			`print("\nDataFrame with NaN:\n", df_nan)`

			`# Fill missing values`
			`print("\nFill NaN with 'Unknown':\n", df_nan.fillna("Unknown"))`

			`# Drop rows with missing values`
			`print("\nDrop rows with NaN:\n", df_nan.dropna())`

			`# 5. Adding and Modifying Columns`
			`# Add a new column`
			`df["Salary"] = [70000, 80000, 50000]`
			`print("\nDataFrame with Salary:\n", df)`

			`# Update values based on a condition`
			`df.loc[df["Age"] > 25, "Salary"] *= 1.10`
			`print("\nUpdated Salary for Age > 25:\n", df)`

			`# 6. Grouping and Aggregation`
			`# Group by City and calculate the average Age`
			`print("\nAverage Age by City:\n", df.groupby("City")["Age"].mean())`

			`# 7. Reading from and Writing to CSV (uncomment to use)`
			`# Read from a CSV`
			`# Commented out as it doesnt exist...`
			`# df_csv = pd.read_csv("path/to/file.csv")`
			`# print("\nData from CSV:\n", df_csv)`

			`# Write to a CSV`
			`# Commented out as i dont want to actually save it.`
			`#df.to_csv("path/to/save.csv", index=False)`