如何(智能)循环遍历GeoDataframe中的所有点并查看最近的邻居-python技术脚本

How to (smartly) loop over all points in a GeoDataframe and look at nearest neighbours

我有一个大 (O(10^6) 行) 数据集(带有值的点)，我需要对所有点执行以下操作：

在预定义的半径内找到最近的 3 个点。
计算这三个点的关联值的平均值。
将平均值保存到我正在查看的点

"非矢量化"方法是简单地遍历所有点...对于所有点，然后应用逻辑。然而，这扩展性很差。

我已经包含了一个玩具示例，它可以满足我的需求。我已经考虑过的想法是：

使用 shapely.ops.nearest_points: 然而，这似乎只返回一个最近的点。
围绕每个单独的点进行缓冲并与原始 GeoDataframe 进行连接：这似乎比天真的方法更糟糕。

这是我要实现的逻辑的玩具示例：

import pandas as pd

import numpy as np

from shapely.wkt import loads

import geopandas as gp



points=[

  'POINT (1 1.1)', 'POINT (1 1.9)', 'POINT (1 3.1)',

  'POINT (2 1)', 'POINT (2 2.1)', 'POINT (2 2.9)',

  'POINT (3 0.8)', 'POINT (3 2)', 'POINT (3 3)'

]

values=[9,8,7,6,5,4,3,2,1]



df=pd.DataFrame({'points':points,'values':values})

gdf=gp.GeoDataFrame(df,geometry=[loads(x) for x in df.points], crs={'init': 'epsg:' + str(25832)})



for index,row in gdf.iterrows(): # Looping over all points

  gdf['dist'] = np.nan

  for index2,row2 in gdf.iterrows(): # Looping over all the other points

    if index==index2: continue

    d=row['geometry'].distance(row2['geometry']) # Calculate distance

    if d<3: gdf.at[index2,'dist']=d # If within cutoff: Store

    else: gdf.at[index2,'dist']=np.nan # Otherwise, be paranoid and leave NAN

  # Calculating mean of values for the 3 nearest points and storing 

  gdf.at[index,'mean']=np.mean(gdf.sort_values('dist').head(3)['values'].tolist())



print(gdf)
     points values    geometry   dist   mean

0 POINT (1 1.1)    9 POINT (1 1.1) 2.758623 6.333333

1 POINT (1 1.9)    8 POINT (1 1.9) 2.282542 7.000000

2 POINT (1 3.1)    7 POINT (1 3.1) 2.002498 5.666667

3  POINT (2 1)    6  POINT (2 1) 2.236068 5.666667

4 POINT (2 2.1)    5 POINT (2 2.1) 1.345362 4.666667

5 POINT (2 2.9)    4 POINT (2 2.9) 1.004988 4.333333

6 POINT (3 0.8)    3 POINT (3 0.8) 2.200000 4.333333

7  POINT (3 2)    2  POINT (3 2) 1.000000 3.000000

8  POINT (3 3)    1  POINT (3 3)    NaN 3.666667

import pandas as pd

import numpy as np

from shapely.wkt import loads

import geopandas as gp

import libpysal



points=[

  'POINT (1 1.1)', 'POINT (1 1.9)', 'POINT (1 3.1)',

  'POINT (2 1)', 'POINT (2 2.1)', 'POINT (2 2.9)',

  'POINT (3 0.8)', 'POINT (3 2)', 'POINT (3 3)'

]

values=[9,8,7,6,5,4,3,2,1]



df=pd.DataFrame({'points':points,'values':values})

gdf=gp.GeoDataFrame(df,geometry=[loads(x) for x in df.points], crs={'init': 'epsg:' + str(25832)})



knn3 = libpysal.weights.KNN.from_dataframe(gdf, k=3)



means = []

for index, row in gdf.iterrows(): # Looping over all points

  knn_neighbors = knn3.neighbors[index]

  knnsubset = gdf.iloc[knn_neighbors]

  neighbors = []

  for ix, r in knnsubset.iterrows():

    if r.geometry.distance(row.geometry) < 3: # max distance here

      neighbors.append(ix)



  subset = gdf.iloc[list(neighbors)]

  means.append(np.mean(subset['values']))

gdf['mean'] = means     points values    geometry   mean

0 POINT (1 1.1)    9 POINT (1 1.1) 6.333333

1 POINT (1 1.9)    8 POINT (1 1.9) 7.000000

2 POINT (1 3.1)    7 POINT (1 3.1) 5.666667

3  POINT (2 1)    6  POINT (2 1) 5.666667

4 POINT (2 2.1)    5 POINT (2 2.1) 4.666667

5 POINT (2 2.9)    4 POINT (2 2.9) 4.333333

6 POINT (3 0.8)    3 POINT (3 0.8) 4.333333

7  POINT (3 2)    2  POINT (3 2) 3.000000

8  POINT (3 3)    1  POINT (3 3) 3.666667

生成的 GeoDataframe 在这里：

import pandas as pd

import numpy as np

from shapely.wkt import loads

import geopandas as gp



points=[

  'POINT (1 1.1)', 'POINT (1 1.9)', 'POINT (1 3.1)',

  'POINT (2 1)', 'POINT (2 2.1)', 'POINT (2 2.9)',

  'POINT (3 0.8)', 'POINT (3 2)', 'POINT (3 3)'

]

values=[9,8,7,6,5,4,3,2,1]



df=pd.DataFrame({'points':points,'values':values})

gdf=gp.GeoDataFrame(df,geometry=[loads(x) for x in df.points], crs={'init': 'epsg:' + str(25832)})



for index,row in gdf.iterrows(): # Looping over all points

  gdf['dist'] = np.nan

  for index2,row2 in gdf.iterrows(): # Looping over all the other points

    if index==index2: continue

    d=row['geometry'].distance(row2['geometry']) # Calculate distance

    if d<3: gdf.at[index2,'dist']=d # If within cutoff: Store

    else: gdf.at[index2,'dist']=np.nan # Otherwise, be paranoid and leave NAN

  # Calculating mean of values for the 3 nearest points and storing 

  gdf.at[index,'mean']=np.mean(gdf.sort_values('dist').head(3)['values'].tolist())



print(gdf)
     points values    geometry   dist   mean

0 POINT (1 1.1)    9 POINT (1 1.1) 2.758623 6.333333

1 POINT (1 1.9)    8 POINT (1 1.9) 2.282542 7.000000

2 POINT (1 3.1)    7 POINT (1 3.1) 2.002498 5.666667

3  POINT (2 1)    6  POINT (2 1) 2.236068 5.666667

4 POINT (2 2.1)    5 POINT (2 2.1) 1.345362 4.666667

5 POINT (2 2.9)    4 POINT (2 2.9) 1.004988 4.333333

6 POINT (3 0.8)    3 POINT (3 0.8) 2.200000 4.333333

7  POINT (3 2)    2  POINT (3 2) 1.000000 3.000000

8  POINT (3 3)    1  POINT (3 3)    NaN 3.666667

import pandas as pd

import numpy as np

from shapely.wkt import loads

import geopandas as gp

import libpysal



points=[

  'POINT (1 1.1)', 'POINT (1 1.9)', 'POINT (1 3.1)',

  'POINT (2 1)', 'POINT (2 2.1)', 'POINT (2 2.9)',

  'POINT (3 0.8)', 'POINT (3 2)', 'POINT (3 3)'

]

values=[9,8,7,6,5,4,3,2,1]



df=pd.DataFrame({'points':points,'values':values})

gdf=gp.GeoDataFrame(df,geometry=[loads(x) for x in df.points], crs={'init': 'epsg:' + str(25832)})



knn3 = libpysal.weights.KNN.from_dataframe(gdf, k=3)



means = []

for index, row in gdf.iterrows(): # Looping over all points

  knn_neighbors = knn3.neighbors[index]

  knnsubset = gdf.iloc[knn_neighbors]

  neighbors = []

  for ix, r in knnsubset.iterrows():

    if r.geometry.distance(row.geometry) < 3: # max distance here

      neighbors.append(ix)



  subset = gdf.iloc[list(neighbors)]

  means.append(np.mean(subset['values']))

gdf['mean'] = means     points values    geometry   mean

0 POINT (1 1.1)    9 POINT (1 1.1) 6.333333

1 POINT (1 1.9)    8 POINT (1 1.9) 7.000000

2 POINT (1 3.1)    7 POINT (1 3.1) 5.666667

3  POINT (2 1)    6  POINT (2 1) 5.666667

4 POINT (2 2.1)    5 POINT (2 2.1) 4.666667

5 POINT (2 2.9)    4 POINT (2 2.9) 4.333333

6 POINT (3 0.8)    3 POINT (3 0.8) 4.333333

7  POINT (3 2)    2  POINT (3 2) 3.000000

8  POINT (3 3)    1  POINT (3 3) 3.666667

你可以看到最后一次迭代的状态。

除了在 NAN 留下的最终位置之外，所有距离都已计算。
最后一次迭代的平均值是三个最近点的平均值：2、4和5，即3,666667。

如何以更具可扩展性的方式做到这一点？

我会为此使用空间索引。您可以使用 libpysal 的功能，它在底层使用 KDTree。对于 2000 个随机点，以下代码运行时间为 3.5 秒，而您的代码运行时间很长(第一分钟后我失去了耐心)。将值保存到列表，然后将列表转换为 DF 的列也可以节省一些时间。

import pandas as pd

import numpy as np

from shapely.wkt import loads

import geopandas as gp



points=[

  'POINT (1 1.1)', 'POINT (1 1.9)', 'POINT (1 3.1)',

  'POINT (2 1)', 'POINT (2 2.1)', 'POINT (2 2.9)',

  'POINT (3 0.8)', 'POINT (3 2)', 'POINT (3 3)'

]

values=[9,8,7,6,5,4,3,2,1]



df=pd.DataFrame({'points':points,'values':values})

gdf=gp.GeoDataFrame(df,geometry=[loads(x) for x in df.points], crs={'init': 'epsg:' + str(25832)})



for index,row in gdf.iterrows(): # Looping over all points

  gdf['dist'] = np.nan

  for index2,row2 in gdf.iterrows(): # Looping over all the other points

    if index==index2: continue

    d=row['geometry'].distance(row2['geometry']) # Calculate distance

    if d<3: gdf.at[index2,'dist']=d # If within cutoff: Store

    else: gdf.at[index2,'dist']=np.nan # Otherwise, be paranoid and leave NAN

  # Calculating mean of values for the 3 nearest points and storing 

  gdf.at[index,'mean']=np.mean(gdf.sort_values('dist').head(3)['values'].tolist())



print(gdf)
     points values    geometry   dist   mean

0 POINT (1 1.1)    9 POINT (1 1.1) 2.758623 6.333333

1 POINT (1 1.9)    8 POINT (1 1.9) 2.282542 7.000000

2 POINT (1 3.1)    7 POINT (1 3.1) 2.002498 5.666667

3  POINT (2 1)    6  POINT (2 1) 2.236068 5.666667

4 POINT (2 2.1)    5 POINT (2 2.1) 1.345362 4.666667

5 POINT (2 2.9)    4 POINT (2 2.9) 1.004988 4.333333

6 POINT (3 0.8)    3 POINT (3 0.8) 2.200000 4.333333

7  POINT (3 2)    2  POINT (3 2) 1.000000 3.000000

8  POINT (3 3)    1  POINT (3 3)    NaN 3.666667

import pandas as pd

import numpy as np

from shapely.wkt import loads

import geopandas as gp

import libpysal



points=[

  'POINT (1 1.1)', 'POINT (1 1.9)', 'POINT (1 3.1)',

  'POINT (2 1)', 'POINT (2 2.1)', 'POINT (2 2.9)',

  'POINT (3 0.8)', 'POINT (3 2)', 'POINT (3 3)'

]

values=[9,8,7,6,5,4,3,2,1]



df=pd.DataFrame({'points':points,'values':values})

gdf=gp.GeoDataFrame(df,geometry=[loads(x) for x in df.points], crs={'init': 'epsg:' + str(25832)})



knn3 = libpysal.weights.KNN.from_dataframe(gdf, k=3)



means = []

for index, row in gdf.iterrows(): # Looping over all points

  knn_neighbors = knn3.neighbors[index]

  knnsubset = gdf.iloc[knn_neighbors]

  neighbors = []

  for ix, r in knnsubset.iterrows():

    if r.geometry.distance(row.geometry) < 3: # max distance here

      neighbors.append(ix)



  subset = gdf.iloc[list(neighbors)]

  means.append(np.mean(subset['values']))

gdf['mean'] = means     points values    geometry   mean

0 POINT (1 1.1)    9 POINT (1 1.1) 6.333333

1 POINT (1 1.9)    8 POINT (1 1.9) 7.000000

2 POINT (1 3.1)    7 POINT (1 3.1) 5.666667

3  POINT (2 1)    6  POINT (2 1) 5.666667

4 POINT (2 2.1)    5 POINT (2 2.1) 4.666667

5 POINT (2 2.9)    4 POINT (2 2.9) 4.333333

6 POINT (3 0.8)    3 POINT (3 0.8) 4.333333

7  POINT (3 2)    2  POINT (3 2) 3.000000

8  POINT (3 3)    1  POINT (3 3) 3.666667

这是结果：

import pandas as pd

import numpy as np

from shapely.wkt import loads

import geopandas as gp



points=[

  'POINT (1 1.1)', 'POINT (1 1.9)', 'POINT (1 3.1)',

  'POINT (2 1)', 'POINT (2 2.1)', 'POINT (2 2.9)',

  'POINT (3 0.8)', 'POINT (3 2)', 'POINT (3 3)'

]

values=[9,8,7,6,5,4,3,2,1]



df=pd.DataFrame({'points':points,'values':values})

gdf=gp.GeoDataFrame(df,geometry=[loads(x) for x in df.points], crs={'init': 'epsg:' + str(25832)})



for index,row in gdf.iterrows(): # Looping over all points

  gdf['dist'] = np.nan

  for index2,row2 in gdf.iterrows(): # Looping over all the other points

    if index==index2: continue

    d=row['geometry'].distance(row2['geometry']) # Calculate distance

    if d<3: gdf.at[index2,'dist']=d # If within cutoff: Store

    else: gdf.at[index2,'dist']=np.nan # Otherwise, be paranoid and leave NAN

  # Calculating mean of values for the 3 nearest points and storing 

  gdf.at[index,'mean']=np.mean(gdf.sort_values('dist').head(3)['values'].tolist())



print(gdf)
     points values    geometry   dist   mean

0 POINT (1 1.1)    9 POINT (1 1.1) 2.758623 6.333333

1 POINT (1 1.9)    8 POINT (1 1.9) 2.282542 7.000000

2 POINT (1 3.1)    7 POINT (1 3.1) 2.002498 5.666667

3  POINT (2 1)    6  POINT (2 1) 2.236068 5.666667

4 POINT (2 2.1)    5 POINT (2 2.1) 1.345362 4.666667

5 POINT (2 2.9)    4 POINT (2 2.9) 1.004988 4.333333

6 POINT (3 0.8)    3 POINT (3 0.8) 2.200000 4.333333

7  POINT (3 2)    2  POINT (3 2) 1.000000 3.000000

8  POINT (3 3)    1  POINT (3 3)    NaN 3.666667

import pandas as pd

import numpy as np

from shapely.wkt import loads

import geopandas as gp

import libpysal



points=[

  'POINT (1 1.1)', 'POINT (1 1.9)', 'POINT (1 3.1)',

  'POINT (2 1)', 'POINT (2 2.1)', 'POINT (2 2.9)',

  'POINT (3 0.8)', 'POINT (3 2)', 'POINT (3 3)'

]

values=[9,8,7,6,5,4,3,2,1]



df=pd.DataFrame({'points':points,'values':values})

gdf=gp.GeoDataFrame(df,geometry=[loads(x) for x in df.points], crs={'init': 'epsg:' + str(25832)})



knn3 = libpysal.weights.KNN.from_dataframe(gdf, k=3)



means = []

for index, row in gdf.iterrows(): # Looping over all points

  knn_neighbors = knn3.neighbors[index]

  knnsubset = gdf.iloc[knn_neighbors]

  neighbors = []

  for ix, r in knnsubset.iterrows():

    if r.geometry.distance(row.geometry) < 3: # max distance here

      neighbors.append(ix)



  subset = gdf.iloc[list(neighbors)]

  means.append(np.mean(subset['values']))

gdf['mean'] = means     points values    geometry   mean

0 POINT (1 1.1)    9 POINT (1 1.1) 6.333333

1 POINT (1 1.9)    8 POINT (1 1.9) 7.000000

2 POINT (1 3.1)    7 POINT (1 3.1) 5.666667

3  POINT (2 1)    6  POINT (2 1) 5.666667

4 POINT (2 2.1)    5 POINT (2 2.1) 4.666667

5 POINT (2 2.9)    4 POINT (2 2.9) 4.333333

6 POINT (3 0.8)    3 POINT (3 0.8) 4.333333

7  POINT (3 2)    2  POINT (3 2) 3.000000

8  POINT (3 3)    1  POINT (3 3) 3.666667

如何(智能)循环遍历GeoDataframe中的所有点并查看最近的邻居

How to (smartly) loop over all points in a GeoDataframe and look at nearest neighbours

相关推荐

Spring部署设置openshift

检查Java中正则表达式中模式的第n次出现

如何让 JTable 停留在已编辑的单元格上

Weblogic 12c 部署

Resteasy Content-Type 默认值

代码不会停止运行，在 Java 中

Out of memory java heap space

Log4j 记录到共享日志文件