preprocessing.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
import os

import numpy as np
import pandas as pd
  3. def swc_to_df(filepath):
  4. df_swc = pd.read_csv(filepath, delim_whitespace=True, comment='#',
  5. names=['n', 'type', 'x', 'y', 'z', 'radius', 'parent'], index_col=False)
  6. df_swc.index = df_swc.n.values
  7. return df_swc
  8. def read_swc(filepath):
  9. swc = swc_to_df(filepath)
  10. # raw data
  11. n = np.array(swc['n'].tolist())
  12. pos = np.array([swc['x'].tolist(), swc['y'].tolist(), swc['z'].tolist()]).T
  13. radius = np.array(swc['radius'].tolist())
  14. t = np.array(swc['type'].tolist())
  15. pid = np.array(swc['parent'].tolist())
  16. t[pid == -1] = 1 # if soma is missing, the first point is soma
  17. e = np.vstack(list(zip(pid[1:], n[1:])))
  18. e = remove_duplicate(pos, e)
  19. soma_loc = 1
  20. return {'n': n,
  21. 'pos': pos,
  22. 'radius': radius,
  23. 't': t,
  24. 'e': e,
  25. 'soma_loc': soma_loc
  26. }
  27. def read_imx(filepath):
  28. import numpy as np
  29. from bs4 import BeautifulSoup
  30. with open(filepath, 'rb') as f:
  31. xml_binary = f.read().decode('utf-8')
  32. soup = BeautifulSoup(xml_binary, 'lxml')
  33. vertex = soup.find('mfilamentgraphvertex').text
  34. vertex = vertex.replace(')', '')
  35. vertex = vertex.split('(')
  36. vertex = [i.strip() for i in vertex]
  37. vertex = [i for i in vertex if i != '']
  38. edges = soup.find('mfilamentgraphedge').text
  39. edges = edges.replace(')', '')
  40. edges = edges.split('(')
  41. edges = [i.strip() for i in edges]
  42. edges = [i for i in edges if i != '']
  43. vortex = np.array([list(map(float, sub.split(','))) for sub in vertex])
  44. # node position/coordinates(x,y,z)
  45. pos = vortex[:, :3] # n->pos
  46. e = np.array([list(map(int, sub.split(','))) for sub in edges]) + 1
  47. e = remove_duplicate(pos, e)
  48. # node index
  49. n = np.arange(1, len(pos) + 1) # n is the node index
  50. # radius
  51. radius = vortex[:, 3]
  52. # type
  53. t = np.ones(len(radius)) * 3
  54. soma_loc = int(soup.find('mfilamentgraph').attrs['mrootvertex']) + 1
  55. t[soma_loc - 1] = 1
  56. return {'n': n,
  57. 'pos': pos,
  58. 'radius': radius,
  59. 't': t,
  60. 'e': e,
  61. 'soma_loc': soma_loc
  62. }
  63. def remove_duplicate(pos, e):
  64. pos_uniq, pos_count = np.unique(pos, return_counts=True, axis=0)
  65. for duplicate_point in pos_uniq[pos_count > 1]:
  66. a, *rest = np.where((duplicate_point == pos).all(1))[0] + 1
  67. for dupl in rest:
  68. for i in [i for i in np.where(e[:, 0] == dupl)[0]]:
  69. e[i][0] = a
  70. for i in [i for i in np.where(e[:, 1] == dupl)[0]]:
  71. e[i][1] = a
  72. e[(e[:, 1] - e[:, 0]) < 0] = e[(e[:, 1] - e[:, 0]) < 0][:, ::-1]
  73. e = np.delete(e, np.where(e[:, 0] == e[:, 1])[0], axis=0)
  74. e, index = np.unique(e, axis=0, return_index=True)
  75. return e[index.argsort()]
  76. def get_edge_dict(n, e, soma_loc):
  77. def _point_already_in_dict(point, edge_dict):
  78. for path_id, points_list in edge_dict.items():
  79. if point in points_list:
  80. return True
  81. else:
  82. return False
  83. edge_dict = {}
  84. path_id = 1
  85. branch_loc = [i for i in n if len(np.where(e[:, 0] == i)[0]) >= 2]
  86. if 1 not in branch_loc:
  87. branch_loc = [1] + branch_loc
  88. for bpt in branch_loc:
  89. bpt_locs_on_e = np.where(e[:, 0] == bpt)[0]
  90. for edge in e[bpt_locs_on_e]:
  91. current_point = edge[0]
  92. next_point = edge[1]
  93. a_list = [current_point]
  94. if _point_already_in_dict(next_point, edge_dict):
  95. pass
  96. else:
  97. a_list.append(next_point) # a list for holding the index of point of one paths
  98. next_point_locs_on_e = np.where(e[:, 0] == next_point)[0]
  99. while len(next_point_locs_on_e) == 1:
  100. next_point = e[next_point_locs_on_e[0]][1]
  101. if _point_already_in_dict(next_point, edge_dict):
  102. pass
  103. else:
  104. a_list.append(next_point) # a list for holding the index of point of one paths
  105. next_point_locs_on_e = np.where(e[:, 0] == next_point)[0]
  106. if len(a_list) < 2:
  107. pass
  108. continue
  109. edge_dict[path_id] = np.array(a_list)
  110. path_id += 1
  111. if soma_loc not in branch_loc:
  112. paths_soma_on = [key for key, value in edge_dict.items() if soma_loc in value]
  113. for path_id in paths_soma_on:
  114. path = edge_dict[path_id]
  115. breakup_point = np.where(edge_dict[path_id] == soma_loc)[0]
  116. path_0 = path[:breakup_point[0] + 1][::-1]
  117. path_1 = path[breakup_point[0]:]
  118. edge_dict[path_id] = path_0
  119. edge_dict[len(edge_dict)] = path_1
  120. return edge_dict
  121. def get_path_dict(pos, radius, t, edge_dict, soma_loc):
  122. path_dict = {}
  123. radius_dict = {}
  124. type_dict = {}
  125. all_keys = edge_dict.keys()
  126. for key in all_keys:
  127. path_dict[key] = pos[edge_dict[key] - 1]
  128. radius_dict[key] = radius[edge_dict[key] - 1]
  129. type_dict[key] = np.unique(t[edge_dict[key] - 1][1:])[0]
  130. path_dict.update({0: pos[soma_loc - 1].reshape(1, 3)})
  131. radius_dict.update({0: [radius[soma_loc - 1]]})
  132. type_dict.update({0: 1})
  133. df_paths = pd.DataFrame()
  134. df_paths['type'] = pd.Series(type_dict)
  135. df_paths['path'] = pd.Series(path_dict)
  136. df_paths['radius'] = pd.Series(radius_dict)
  137. return df_paths
  138. def sort_path_direction(df_paths):
  139. df_paths = df_paths.copy()
  140. soma = df_paths.loc[0].path.flatten()
  141. df_paths['connect_to'] = np.nan
  142. df_paths['connect_to_at'] = ''
  143. df_paths['connect_to_at'] = df_paths['connect_to_at'].apply(np.array)
  144. path_ids_head = df_paths[df_paths.path.apply(lambda x: (x[0] == soma).all())].index
  145. if len(path_ids_head) > 0:
  146. df_paths.loc[path_ids_head, 'connect_to'] = -1
  147. df_paths.loc[path_ids_head, 'connect_to_at'] = pd.Series({path_id: soma for path_id in path_ids_head})
  148. path_ids_tail = df_paths[df_paths.path.apply(lambda x: (x[-1] == soma).all())].index
  149. if len(path_ids_tail) > 0:
  150. df_paths.loc[path_ids_tail, 'path'] = df_paths.loc[path_ids_tail].path.apply(lambda x: x[::-1])
  151. df_paths.loc[path_ids_tail, 'connect_to'] = -1
  152. df_paths.loc[path_ids_tail, 'connect_to_at'] = pd.Series({path_id: soma for path_id in path_ids_tail})
  153. new_target_paths = list(df_paths[~np.isnan(df_paths.connect_to)].index) # seed the first round of paths to check
  154. while np.count_nonzero(~np.isnan(df_paths.connect_to)) != len(df_paths):
  155. all_checked_paths = list(df_paths[~np.isnan(df_paths.connect_to)].index)
  156. num_check_paths_before = len(all_checked_paths)
  157. target_paths = new_target_paths
  158. new_target_paths = [] # empty the list to hold new target paths for next round
  159. for target_path_id in target_paths:
  160. if target_path_id == 0: continue
  161. target_path = df_paths.loc[target_path_id].path
  162. path_ids_head = df_paths[df_paths.path.apply(lambda x: (x[0] == target_path[-1]).all())].index.tolist()
  163. path_ids_head = [i for i in path_ids_head if i not in all_checked_paths]
  164. if len(path_ids_head) > 0:
  165. df_paths.loc[path_ids_head, 'connect_to'] = target_path_id
  166. df_paths.loc[path_ids_head, 'connect_to_at'] = pd.Series(
  167. {path_id: target_path[-1] for path_id in path_ids_head})
  168. new_target_paths = new_target_paths + path_ids_head
  169. path_ids_tail = df_paths[df_paths.path.apply(lambda x: (x[-1] == target_path[-1]).all())].index.tolist()
  170. path_ids_tail = [i for i in path_ids_tail if i not in all_checked_paths]
  171. if len(path_ids_tail) > 0:
  172. df_paths.loc[path_ids_tail, 'path'] = df_paths.loc[path_ids_tail].path.apply(lambda x: x[::-1])
  173. df_paths.loc[path_ids_tail, 'connect_to'] = target_path_id
  174. df_paths.loc[path_ids_tail, 'connect_to_at'] = pd.Series(
  175. {path_id: target_path[-1] for path_id in path_ids_tail})
  176. new_target_paths = new_target_paths + path_ids_tail
  177. num_check_paths_after = len(list(df_paths[~np.isnan(df_paths.connect_to)].index))
  178. if num_check_paths_before == num_check_paths_after:
  179. num_disconneted = len(df_paths) - num_check_paths_after
  180. break
  181. df_paths_drop = df_paths[np.isnan(df_paths.connect_to)]
  182. df_paths = df_paths.drop(df_paths[np.isnan(df_paths.connect_to)].index)
  183. return df_paths, df_paths_drop
  184. def get_paths_nearest_to_tree(df_paths, df_drops, num_all_paths):
  185. """
  186. Get paths in df_drops which stay nearest to the connected paths.
  187. Paremeters
  188. ----------
  189. df_paths:
  190. DataFrame holding all connected paths
  191. df_drops:
  192. DataFrame holding all disconnected paths
  193. Return
  194. ------
  195. res: dict
  196. {'p0': {'path': path,
  197. 'path_id': path_id_drop,
  198. 'radius': radius,
  199. 'type': t},
  200. 'target': {'path': path_tree,
  201. 'path_id': path_id_tree,
  202. 'radius': radius_tree,
  203. 'type': type_tree
  204. } }
  205. """
  206. res = []
  207. for row in df_drops.iterrows():
  208. path_id_drop = row[0]
  209. path = row[1].path
  210. path_id_tree = df_paths.path.apply(lambda x: np.sqrt(((x[-1] - path) ** 2).sum(1).min())).argmin()
  211. distance_arr = np.sqrt(np.sum((path - df_paths.loc[path_id_tree].path[-1]) ** 2, 1))
  212. loc_nearest = distance_arr.argmin()
  213. dist_nearest = distance_arr.min()
  214. res.append((path_id_drop, path_id_tree, loc_nearest, dist_nearest))
  215. res = np.array(res)
  216. nearest_paths_data = res[np.where(res[:, 3] == res[:, 3].min())[0]]
  217. ### get all paths data
  218. num_paths_to_tree = len(nearest_paths_data)
  219. path_id_tree = nearest_paths_data[0][1]
  220. path_tree = df_paths.loc[path_id_tree].path
  221. radius_tree = df_paths.loc[path_id_tree].radius
  222. type_tree = df_paths.loc[path_id_tree].type
  223. loc_nearest = int(nearest_paths_data[0][2])
  224. distance_between = nearest_paths_data[0][3]
  225. res = {}
  226. for i, datum in enumerate(nearest_paths_data):
  227. path_id_drop = datum[0]
  228. path = df_drops.loc[path_id_drop].path
  229. radius = df_drops.loc[path_id_drop].radius
  230. t = df_drops.loc[path_id_drop].type
  231. if loc_nearest == 0:
  232. path = path
  233. point_connect = path[0]
  234. radius_connect = radius[0]
  235. res['p{}'.format(i)] = {'path': path,
  236. 'path_id': path_id_drop,
  237. 'radius': radius,
  238. 'type': t}
  239. elif loc_nearest == len(path) - 1:
  240. path = path[::-1]
  241. point_connect = path[0]
  242. radius_connect = radius[0]
  243. res['p{}'.format(i)] = {'path': path,
  244. 'path_id': path_id_drop,
  245. 'radius': radius,
  246. 'type': t}
  247. else:
  248. p0 = path[:loc_nearest + 1][::-1]
  249. p1 = path[loc_nearest:]
  250. r0 = radius[:loc_nearest + 1]
  251. r1 = radius[loc_nearest:]
  252. point_connect = p0[0]
  253. radius_connect = r0[0]
  254. if len(p0) > 1:
  255. res['p0'.format(i)] = {'path': p0,
  256. 'path_id': path_id_drop,
  257. 'radius': r0,
  258. 'type': t}
  259. if len(p1) > 1:
  260. res['p1'.format(i)] = {'path': p1,
  261. # 'path_id': len(df_paths) + len(df_drops),
  262. 'path_id': num_all_paths,
  263. 'radius': r1,
  264. 'type': t}
  265. if distance_between > 0:
  266. path_tree = np.vstack([path_tree, point_connect])
  267. radius_tree = np.hstack([radius_tree, radius_connect])
  268. res['target'] = {'path': path_tree,
  269. 'path_id': path_id_tree,
  270. 'radius': radius_tree,
  271. 'type': type_tree
  272. }
  273. return res
  274. def reconnect_dropped_paths(df_paths, df_drops):
  275. df_paths = df_paths.copy()
  276. df_drops = df_drops.copy()
  277. num_dropped_paths = len(df_drops)
  278. while len(df_drops) > 0:
  279. num_all_paths = num_dropped_paths + len(df_paths)
  280. paths_data = get_paths_nearest_to_tree(df_paths, df_drops, num_all_paths)
  281. target = paths_data.pop('target')
  282. path_id_tree = target['path_id']
  283. path_tree = target['path']
  284. radius_tree = target['radius']
  285. tail_path_tree = path_tree[-1]
  286. df_paths.at[int(path_id_tree), 'path'] = path_tree
  287. df_paths.at[int(path_id_tree), 'radius'] = radius_tree
  288. if len(paths_data) == 1:
  289. p = paths_data.pop('p0')
  290. path_id_drop = p['path_id']
  291. path_drop = p['path']
  292. radius_drop = p['radius']
  293. t = p['type']
  294. df_paths.at[int(path_id_tree), 'path'] = np.vstack([path_tree[:-1], path_drop])
  295. df_paths.at[int(path_id_tree), 'radius'] = np.hstack([radius_tree[:-1], radius_drop])
  296. df_drops.drop(path_id_drop, inplace=True)
  297. else:
  298. for key, values in paths_data.items():
  299. p = paths_data[key]
  300. path_id_drop = p['path_id']
  301. path_drop = p['path']
  302. radius_drop = p['radius']
  303. t = p['type']
  304. df_paths.loc[int(path_id_drop)] = [t, path_drop, radius_drop, path_id_tree, tail_path_tree]
  305. try:
  306. df_drops.drop(path_id_drop, inplace=True)
  307. except:
  308. pass
  309. return df_paths.sort_index()
  310. def find_connection(df_paths):
  311. # find all paths connect to current path.
  312. connected_by_dict = {}
  313. connected_by_at_dict = {}
  314. for path_id in df_paths.index:
  315. connected_by_dict[path_id] = df_paths[df_paths.connect_to == path_id].index.tolist()
  316. connected_by_at_dict[path_id] = df_paths[df_paths.connect_to == path_id].connect_to_at.tolist()
  317. df_paths['connected_by'] = pd.Series(connected_by_dict)
  318. df_paths['connected_by_at'] = pd.Series(connected_by_at_dict)
  319. back_to_soma_dict = {}
  320. for path_id in df_paths.index:
  321. list_to_soma = [path_id]
  322. next_path_id = df_paths.loc[path_id].connect_to
  323. while next_path_id != -1:
  324. list_to_soma.append(next_path_id)
  325. next_path_id = df_paths.loc[next_path_id].connect_to
  326. back_to_soma_dict[path_id] = list_to_soma
  327. df_paths['back_to_soma'] = pd.Series(back_to_soma_dict)
  328. return df_paths
  329. def write_swc(df_paths):
  330. path_checked = []
  331. swc_arr = []
  332. list_back_to_soma = (df_paths.sort_values(['connect_to']).back_to_soma).tolist()
  333. for i, back_to_soma in enumerate(list_back_to_soma):
  334. for path_id in back_to_soma[::-1]:
  335. if path_id in path_checked:
  336. continue
  337. path_s = df_paths.loc[path_id]
  338. path = path_s['path']
  339. path_radius = path_s['radius']
  340. if len(path) > 1:
  341. path = path[1:]
  342. path_radius = path_s['radius'][1:]
  343. path_type = path_s['type']
  344. #
  345. connect_to = path_s['connect_to']
  346. connect_to_at = path_s['connect_to_at']
  347. swc_path = np.column_stack([np.ones(len(path)) * path_type, path]) # type
  348. swc_path = np.column_stack([np.arange(len(swc_arr) + 1, len(path) + len(swc_arr) + 1), swc_path]) # ID
  349. swc_path = np.column_stack([swc_path, path_radius * np.ones(len(path))]) # radius
  350. swc_path = np.column_stack([swc_path, swc_path[:, 0] - 1]) # placeholder for PID
  351. if len(swc_arr) == 0:
  352. swc_arr = swc_path
  353. swc_arr[0][-1] = -1
  354. else:
  355. pid = np.where((swc_arr[:, 2:5] == connect_to_at).all(1))[0] + 1
  356. # logging.info(pid)
  357. if len(pid) > 1:
  358. swc_path[0][-1] = pid[0]
  359. # logging.info(swc_arr[pid[0]][2:5], swc_arr[pid[1]][2:5])
  360. elif len(pid) == 1:
  361. swc_path[0][-1] = pid
  362. swc_arr = np.vstack([swc_arr, swc_path])
  363. path_checked.append(path_id)
  364. df_swc = pd.DataFrame(swc_arr)
  365. df_swc.index = np.arange(1, len(df_swc) + 1)
  366. df_swc.columns = ['n', 'type', 'x', 'y', 'z', 'radius', 'parent']
  367. df_swc[['n', 'type', 'parent']] = df_swc[['n', 'type', 'parent']].astype(int)
  368. return df_swc
  369. def data_preprocessing(filepath):
  370. filetype = filepath.split('/')[-1].split('.')[-1].lower()
  371. filename = filepath.split('/')[-1].split('.')[0].lower()
  372. if filetype == 'swc':
  373. data = read_swc(filepath)
  374. elif filetype == 'imx':
  375. data = read_imx(filepath)
  376. else:
  377. raise NotImplementedError
  378. e = data['e']
  379. n = data['n']
  380. t = data['t']
  381. pos = data['pos']
  382. radius = data['radius']
  383. soma_loc = data['soma_loc']
  384. edge_dict = get_edge_dict(n, e, soma_loc)
  385. df_paths = get_path_dict(pos, radius, t, edge_dict, soma_loc)
  386. df_paths, df_paths_drop = sort_path_direction(df_paths)
  387. df_paths = reconnect_dropped_paths(df_paths, df_paths_drop)
  388. df_paths = find_connection(df_paths)
  389. df_swc = write_swc(df_paths)
  390. if (df_paths.iloc[1:].type == 0).all():
  391. type_list = np.ones(len(df_paths)).astype(int) * 3
  392. type_list[0] = 1
  393. df_paths = df_paths.assign(type=type_list)
  394. return df_swc, df_paths