Source code for musif.process.utils

import re
from logging.config import dictConfig
from typing import List

import pandas as pd
from pandas import DataFrame

from musif.config import (
    ENDSWITH,
    INSTRUMENTS_TO_DELETE,
    INSTRUMENTS_TO_KEEP,
    STARTSWITH,
)
from musif.extract.basic_modules.scoring.constants import (
    FAMILY_INSTRUMENTATION,
    FAMILY_SCORING,
)
from musif.extract.features.harmony.constants import (
    KEY_MODULATORY,
    KEY_PERCENTAGE,
    KEY_PREFIX,
    CHORD_prefix,
)
from musif.extract.features.melody.constants import TRIMMED_INTERVALLIC_MEAN
from musif.extract.features.prefix import get_part_prefix
from musif.extract.features.scale.constants import DEGREE_PREFIX
from musif.logs import pinfo

from .constants import voices_list_prefixes


def replace_nans(df):
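    """Fill NaN values with the string "NA" in columns related to intervals,
    keys, chords, additions, numerals, dotted rhythms, scale degrees, trimmed
    intervallic mean, and dynamics."""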
    for col in df.columns:
        if (
            "Interval" in col
            or col.startswith("Key_")
            or col.startswith((CHORD_prefix, "Chords_", "Additions_", "Numerals_"))
            or col.endswith(("_DottedRhythm", "_DoubleDottedRhythm"))
            or ("_Degree" and TRIMMED_INTERVALLIC_MEAN and "_Dyn") in col
        ):
            df[col] = df[col].fillna("NA")


def join_part_degrees(
    total_degrees: List[str], part: str, df: DataFrame, sufix: str = ""
) -> None:
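    """Aggregate the scale-degree columns of a part into summary columns:
    ascending ("#") and descending ("b") alterations, double alterations
    ("x" and "bb"), natural degrees, and non-natural degrees."""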
    part_degrees = [i for i in total_degrees if part in i]

    aug = [i for i in part_degrees if "#" in i]
    desc = [i for i in part_degrees if "b" in i and "bb" not in i]
    d_desc = [i for i in part_degrees if "bb" in i]
    d_asc = [i for i in part_degrees if "x" in i]

    pattern = "^" + part + "Degree" + "[0-9].*"
    degree_nat = [x for x in part_degrees if re.search(pattern, x)]
    degree_nonat = [i for i in part_degrees if i not in degree_nat]

    df[part + DEGREE_PREFIX + "_Asc" + sufix] = df[aug].sum(axis=1)
    df[part + DEGREE_PREFIX + "_Desc" + sufix] = df[desc].sum(axis=1)
    df[part + DEGREE_PREFIX + "_Dasc" + sufix] = df[d_asc].sum(axis=1)
    df[part + DEGREE_PREFIX + "_Ddesc" + sufix] = df[d_desc].sum(axis=1)
    df[part + DEGREE_PREFIX + "_Nat" + sufix] = df[degree_nat].sum(axis=1)
    df[part + DEGREE_PREFIX + "_Nonat" + sufix] = df[degree_nonat].sum(axis=1)


def log_errors_and_shape(
    composer_counter: list, novoices_counter: list, df: DataFrame
) -> None:
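    """Log the names and counts of files skipped by composer and by missing
    voices, together with the final shape of the processed DataFrame."""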
    pinfo(f"\nTotal files skipped by composer: {len(composer_counter)}")
    pinfo(str(composer_counter))
    pinfo(f"\nTotal files skipped by no-voices: { len(novoices_counter)}")
    pinfo(str(novoices_counter))
    # pinfo(f"\nTotal files skipped by duetos/trietos: {len(duetos_counter)}")
    # pinfo(str(duetos_counter))
    pinfo(f"\nFinal shape of the DataFrame: {df.shape[0]} rows, {df.shape[1]} features")


def _delete_columns(data: DataFrame, config: dictConfig) -> None:
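    """Drop columns according to the post-processing configuration: parts of
    instruments marked for deletion (unless they must be kept), columns matching
    the configured suffixes, prefixes, substrings and exact names, Sound columns
    of non-voice parts, family instrumentation/scoring columns, empty voice
    columns, and, optionally, columns whose proportion of NaNs exceeds the
    configured threshold."""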
    # pinfo("\nDeleting not useful columns...")
    to_delete = []
    instruments_to_keep = [get_part_prefix(i) for i in config[INSTRUMENTS_TO_KEEP]]
    for inst in config[INSTRUMENTS_TO_DELETE]:
        # for i in data.columns:
        #     if "Part" + inst + "_" in i
        part_prefix = "Part" + inst  # + "_"
        for col in data.columns:
            if part_prefix in col and all(
                inst not in col for inst in instruments_to_keep
            ):
                pass
                to_delete.append(col)
            else:
                pass
        # to_delete += [i for i in data.columns if part_prefix in i and instrument not in i for instrument in instruments_to_keep]

    to_delete += [i for i in data.columns if i.endswith(tuple(config[ENDSWITH]))]
    to_delete += [i for i in data.columns if i.startswith(tuple(config[STARTSWITH]))]
    to_delete += [
        col
        for col in data.columns
        if any(substring in col for substring in config["columns_contain"])
    ]
    to_delete += [
        col
        for col in data.columns
        if any(string == col for string in config["columns_match"])
    ]
    to_delete += [i for i in data.columns if i.startswith("Sound") and "Voice" not in i]

    to_delete += [FAMILY_INSTRUMENTATION, FAMILY_SCORING]

    # Remove empty voices
    to_delete += [
        col
        for col in data.columns
        if col.startswith(tuple(voices_list_prefixes))
        and all(data[col].isnull().values)
    ]

    # Remove columns whose proportion of NaNs exceeds the configured threshold
    if config["delete_columns_with_nans"]:
        th = config["max_nan_columns"] or 0.0
        idx = data.isna().sum(axis=0) / data.shape[0] > th
        to_delete += data.columns[idx].to_list()

    data.drop(columns=to_delete, inplace=True, errors="ignore")


def join_keys(df: DataFrame) -> None:
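    """Aggregate the Key_*_Percentage columns into summary columns:
    SD (IV + II + VI), sd (iv + ii), SubD (SD + sd), T (I + i),
    rel (III + vi), and Other (all remaining non-modulatory key percentages)."""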
    key_SD = [
        i
        for i in [
            KEY_PREFIX + "IV" + KEY_PERCENTAGE,
            KEY_PREFIX + "II" + KEY_PERCENTAGE,
            KEY_PREFIX + "VI" + KEY_PERCENTAGE,
        ]
        if i in df
    ]
    key_sd = [
        i
        for i in [
            KEY_PREFIX + "iv" + KEY_PERCENTAGE,
            KEY_PREFIX + "ii" + KEY_PERCENTAGE,
        ]
        if i in df
    ]
    key_tonic = [
        i
        for i in [KEY_PREFIX + "I" + KEY_PERCENTAGE, KEY_PREFIX + "i" + KEY_PERCENTAGE]
        if i in df
    ]
    key_rel = [
        i
        for i in [
            KEY_PREFIX + "III" + KEY_PERCENTAGE,
            KEY_PREFIX + "vi" + KEY_PERCENTAGE,
        ]
        if i in df
    ]

    total_key = key_rel + key_tonic + key_sd + key_SD
    others_key = [
        i
        for i in df.columns
        if KEY_PREFIX in i and i not in total_key and KEY_MODULATORY not in i
    ]

    df[KEY_PREFIX + "SD" + KEY_PERCENTAGE] = df[key_SD].sum(axis=1)
    df[KEY_PREFIX + "sd" + KEY_PERCENTAGE] = df[key_sd].sum(axis=1)
    df[KEY_PREFIX + "SubD" + KEY_PERCENTAGE] = (
        df[KEY_PREFIX + "sd" + KEY_PERCENTAGE] + df[KEY_PREFIX + "SD" + KEY_PERCENTAGE]
    )
    df[KEY_PREFIX + "T" + KEY_PERCENTAGE] = df[key_tonic].sum(axis=1)
    df[KEY_PREFIX + "rel" + KEY_PERCENTAGE] = df[key_rel].sum(axis=1)
    df[KEY_PREFIX + "Other" + KEY_PERCENTAGE] = df[others_key].sum(axis=1)


def join_keys_modulatory(df: DataFrame):
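    """Aggregate the Key_Modulatory_* columns into summary columns, mirroring
    join_keys: SD (IV + II + VI), sd (iv + ii), SubD (SD + sd), T (I + i),
    rel (III + vi), and Other (all remaining modulatory key columns)."""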
    key_SD = [
        i
        for i in [
            KEY_PREFIX + KEY_MODULATORY + "IV",
            KEY_PREFIX + KEY_MODULATORY + "II",
            KEY_PREFIX + KEY_MODULATORY + "VI",
        ]
        if i in df
    ]
    key_sd = [
        i
        for i in [
            KEY_PREFIX + KEY_MODULATORY + "iv",
            KEY_PREFIX + KEY_MODULATORY + "ii",
        ]
        if i in df
    ]
    key_tonic = [
        i
        for i in [KEY_PREFIX + KEY_MODULATORY + "I", KEY_PREFIX + KEY_MODULATORY + "i"]
        if i in df
    ]
    key_rel = [
        i
        for i in [
            KEY_PREFIX + KEY_MODULATORY + "III",
            KEY_PREFIX + KEY_MODULATORY + "vi",
        ]
        if i in df
    ]

    total_key_mod = key_rel + key_tonic + key_sd + key_SD
    others_key_mod = [
        i
        for i in df.columns
        if KEY_PREFIX + KEY_MODULATORY in i and i not in total_key_mod
    ]

    df[KEY_PREFIX + KEY_MODULATORY + "SD"] = df[key_SD].sum(axis=1)
    df[KEY_PREFIX + KEY_MODULATORY + "sd"] = df[key_sd].sum(axis=1)
    df[KEY_PREFIX + KEY_MODULATORY + "SubD"] = (
        df[KEY_PREFIX + KEY_MODULATORY + "sd"] + df[KEY_PREFIX + KEY_MODULATORY + "SD"]
    )
    df[KEY_PREFIX + KEY_MODULATORY + "T"] = df[key_tonic].sum(axis=1)
    df[KEY_PREFIX + KEY_MODULATORY + "rel"] = df[key_rel].sum(axis=1)
    df[KEY_PREFIX + KEY_MODULATORY + "Other"] = df[others_key_mod].sum(axis=1)

def _drop_filenames_nan_rows(df):
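    """Drop rows whose FileName is NaN, reporting the indices of the rows
    affected by computation errors."""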
    rows_with_nan_filename = list(df[df["FileName"].isna()].index)
    if len(rows_with_nan_filename) > 0:
        print("There are some files with computation errors!")
        print(rows_with_nan_filename)
        df.dropna(subset=["FileName"], inplace=True)


def merge_dataframes(name: str, dest_path: str) -> None:
    """
    Joins two partial feature dataframes into one, dropping rows whose FileName
    is missing. This is intended for cases where the extraction of a folder
    cannot be done all at once.

    The two inputs are read from ``<name>_1.csv`` and ``<name>_2.csv``, and the
    concatenation is written to ``<dest_path>.csv``.
    """
    csv = ".csv"
    name1 = name + "_1" + csv
    name2 = name + "_2" + csv
    df1 = pd.read_csv(name1, low_memory=False)
    df2 = pd.read_csv(name2, low_memory=False)
    _drop_filenames_nan_rows(df1)
    _drop_filenames_nan_rows(df2)
    total_dataframe = pd.concat((df1, df2), axis=0)
    total_dataframe.to_csv(dest_path + csv, index=False)
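

# Minimal usage sketch (illustrative only; the file names "features_1.csv" and
# "features_2.csv" are assumptions, standing in for two partial extractions
# saved in the working directory):
#
#     merge_dataframes("features", "features_total")
#
# This reads both CSV files, drops rows with a missing FileName, and writes the
# concatenation to "features_total.csv".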